{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.00525,
  "eval_steps": 500,
  "global_step": 525,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.6875,
      "completions/mean_terminated_length": 2.6875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7784842923283577,
      "epoch": 1e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": 0.0,
      "num_tokens": 45317.0,
      "reward": 2.1999998092651367,
      "reward_std": 1.7413380146026611,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.75,
      "rewards/probe_completion_length/std": 1.4142135381698608,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.6395175457000732,
      "sampling/importance_sampling_ratio/mean": 0.8939619064331055,
      "sampling/importance_sampling_ratio/min": 0.38482141494750977,
      "sampling/sampling_logp_difference/max": 0.7602746486663818,
      "sampling/sampling_logp_difference/mean": 0.08076699078083038,
      "step": 1,
      "step_time": 11.14861279899992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7784842923283577,
      "epoch": 2e-05,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 2.2857142857142855e-07,
      "loss": 0.0,
      "step": 2,
      "step_time": 6.046958927999867
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.3125,
      "completions/mean_terminated_length": 2.3125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0997111424803734,
      "epoch": 3e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 4.571428571428571e-07,
      "loss": 0.0,
      "num_tokens": 95616.0,
      "reward": 1.2625000476837158,
      "reward_std": 0.4709290862083435,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.3125,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.11922025680542,
      "sampling/importance_sampling_ratio/mean": 0.8232259750366211,
      "sampling/importance_sampling_ratio/min": 0.467467337846756,
      "sampling/sampling_logp_difference/max": 0.4922431409358978,
      "sampling/sampling_logp_difference/mean": 0.10929661989212036,
      "step": 3,
      "step_time": 11.024081129000137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0997111424803734,
      "epoch": 4e-05,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 6.857142857142857e-07,
      "loss": 0.0,
      "step": 4,
      "step_time": 6.094397290000188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8431090712547302,
      "epoch": 5e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 9.142857142857142e-07,
      "loss": 0.0,
      "num_tokens": 143386.0,
      "reward": 1.4187500476837158,
      "reward_std": 0.507007360458374,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.5770914554595947,
      "sampling/importance_sampling_ratio/mean": 0.8894044160842896,
      "sampling/importance_sampling_ratio/min": 0.43558987975120544,
      "sampling/sampling_logp_difference/max": 0.7718255519866943,
      "sampling/sampling_logp_difference/mean": 0.07799207419157028,
      "step": 5,
      "step_time": 10.487163771999917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8431090712547302,
      "epoch": 6e-05,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 1.1428571428571428e-06,
      "loss": 0.0,
      "step": 6,
      "step_time": 6.185672804999967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6589353680610657,
      "epoch": 7e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 1.3714285714285715e-06,
      "loss": 0.0,
      "num_tokens": 188768.0,
      "reward": 2.043750047683716,
      "reward_std": 1.201058030128479,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.1102030277252197,
      "sampling/importance_sampling_ratio/mean": 0.9080963134765625,
      "sampling/importance_sampling_ratio/min": 0.7015503644943237,
      "sampling/sampling_logp_difference/max": 0.22858071327209473,
      "sampling/sampling_logp_difference/mean": 0.05401453375816345,
      "step": 7,
      "step_time": 11.051496965999718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6589353680610657,
      "epoch": 8e-05,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 1.6e-06,
      "loss": 0.0,
      "step": 8,
      "step_time": 5.989734975000033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8500039726495743,
      "epoch": 9e-05,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 1.8285714285714284e-06,
      "loss": 0.0,
      "num_tokens": 243357.0,
      "reward": 1.5125000476837158,
      "reward_std": 0.504016101360321,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 2.34220027923584,
      "sampling/importance_sampling_ratio/mean": 0.8937994241714478,
      "sampling/importance_sampling_ratio/min": 0.5326315760612488,
      "sampling/sampling_logp_difference/max": 0.925993800163269,
      "sampling/sampling_logp_difference/mean": 0.0961286872625351,
      "step": 9,
      "step_time": 10.82308242299996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8500039726495743,
      "epoch": 0.0001,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 2.057142857142857e-06,
      "loss": 0.0,
      "step": 10,
      "step_time": 6.345786435000036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5993861705064774,
      "epoch": 0.00011,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 2.2857142857142856e-06,
      "loss": 0.0,
      "num_tokens": 292572.0,
      "reward": 2.231250047683716,
      "reward_std": 1.0846248865127563,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.206714153289795,
      "sampling/importance_sampling_ratio/mean": 0.9141116738319397,
      "sampling/importance_sampling_ratio/min": 0.6659113764762878,
      "sampling/sampling_logp_difference/max": 0.2911492586135864,
      "sampling/sampling_logp_difference/mean": 0.05162158980965614,
      "step": 11,
      "step_time": 10.827366451999865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5993861705064774,
      "epoch": 0.00012,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 2.5142857142857142e-06,
      "loss": 0.0,
      "step": 12,
      "step_time": 5.958845550000092
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7044863402843475,
      "epoch": 0.00013,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.000793688406702131,
      "kl": 0.0,
      "learning_rate": 2.742857142857143e-06,
      "loss": 0.0,
      "num_tokens": 340626.0,
      "reward": 1.6062500476837158,
      "reward_std": 0.6530017852783203,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.96875,
      "rewards/probe_shaping_dominance/std": 0.1767766922712326,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.1767766922712326,
      "rewards/rollout_reward_func/mean": -0.9375,
      "rewards/rollout_reward_func/std": 0.3535533845424652,
      "sampling/importance_sampling_ratio/max": 1.0612809658050537,
      "sampling/importance_sampling_ratio/mean": 0.8982734680175781,
      "sampling/importance_sampling_ratio/min": 0.45619186758995056,
      "sampling/sampling_logp_difference/max": 0.5952367782592773,
      "sampling/sampling_logp_difference/mean": 0.05711527168750763,
      "step": 13,
      "step_time": 10.020639281000172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7065823003649712,
      "epoch": 0.00014,
      "grad_norm": 0.00079762825043872,
      "kl": 0.000474392608339258,
      "learning_rate": 2.9714285714285716e-06,
      "loss": -0.0,
      "step": 14,
      "step_time": 6.576750468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 3.15625,
      "completions/mean_terminated_length": 3.15625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.6735320538282394,
      "epoch": 0.00015,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.006771796382963657,
      "kl": 0.004827667989957263,
      "learning_rate": 3.2e-06,
      "loss": 0.0,
      "num_tokens": 381093.0,
      "reward": 2.6062498092651367,
      "reward_std": 1.4280457496643066,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.21875,
      "rewards/probe_completion_length/std": 1.2374368906021118,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.78125,
      "rewards/probe_shaping_dominance/std": 0.420013427734375,
      "rewards/probe_terminal_raw/mean": 0.21875,
      "rewards/probe_terminal_raw/std": 0.420013427734375,
      "rewards/rollout_reward_func/mean": -0.5625,
      "rewards/rollout_reward_func/std": 0.84002685546875,
      "sampling/importance_sampling_ratio/max": 1.2053685188293457,
      "sampling/importance_sampling_ratio/mean": 0.890531063079834,
      "sampling/importance_sampling_ratio/min": 0.5232845544815063,
      "sampling/sampling_logp_difference/max": 0.4864621162414551,
      "sampling/sampling_logp_difference/mean": 0.06389547139406204,
      "step": 15,
      "step_time": 10.024728831000175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.671803779900074,
      "epoch": 0.00016,
      "grad_norm": 0.005499588791280985,
      "kl": 0.008253770578448894,
      "learning_rate": 3.428571428571428e-06,
      "loss": -0.0,
      "step": 16,
      "step_time": 5.566490591999809
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8892305418848991,
      "epoch": 0.00017,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.003721765475347638,
      "kl": 0.0053854092129768105,
      "learning_rate": 3.657142857142857e-06,
      "loss": -0.0,
      "num_tokens": 424900.0,
      "reward": 1.7000000476837158,
      "reward_std": 0.7620007395744324,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9375,
      "rewards/probe_shaping_dominance/std": 0.24593468010425568,
      "rewards/probe_terminal_raw/mean": 0.0625,
      "rewards/probe_terminal_raw/std": 0.24593468010425568,
      "rewards/rollout_reward_func/mean": -0.875,
      "rewards/rollout_reward_func/std": 0.49186936020851135,
      "sampling/importance_sampling_ratio/max": 0.9796642661094666,
      "sampling/importance_sampling_ratio/mean": 0.8644938468933105,
      "sampling/importance_sampling_ratio/min": 0.5730080008506775,
      "sampling/sampling_logp_difference/max": 0.48313581943511963,
      "sampling/sampling_logp_difference/mean": 0.06963828206062317,
      "step": 17,
      "step_time": 9.886864271999912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8878831416368484,
      "epoch": 0.00018,
      "grad_norm": 0.0030290314462035894,
      "kl": 0.013682284701644676,
      "learning_rate": 3.885714285714286e-06,
      "loss": -0.0,
      "step": 18,
      "step_time": 6.536783615000104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8332870826125145,
      "epoch": 0.00019,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.497891859500669e-05,
      "kl": 0.0054511257992544415,
      "learning_rate": 4.114285714285714e-06,
      "loss": 0.0,
      "num_tokens": 474441.0,
      "reward": 1.9500000476837158,
      "reward_std": 1.2443419694900513,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.0432957410812378,
      "sampling/importance_sampling_ratio/mean": 0.8474187850952148,
      "sampling/importance_sampling_ratio/min": 0.3696611821651459,
      "sampling/sampling_logp_difference/max": 1.1028695106506348,
      "sampling/sampling_logp_difference/mean": 0.0895814448595047,
      "step": 19,
      "step_time": 10.478061494000144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8406587913632393,
      "epoch": 0.0002,
      "grad_norm": 3.429582648095675e-05,
      "kl": 0.004676993812608998,
      "learning_rate": 4.342857142857142e-06,
      "loss": 0.0,
      "step": 20,
      "step_time": 6.040967968000018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8806165009737015,
      "epoch": 0.00021,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00014957028906792402,
      "kl": 0.0227611528680427,
      "learning_rate": 4.571428571428571e-06,
      "loss": 0.0,
      "num_tokens": 522511.0,
      "reward": 1.4500000476837158,
      "reward_std": 0.5080004930496216,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.8144991397857666,
      "sampling/importance_sampling_ratio/mean": 0.9702324867248535,
      "sampling/importance_sampling_ratio/min": 0.36904460191726685,
      "sampling/sampling_logp_difference/max": 0.860051155090332,
      "sampling/sampling_logp_difference/mean": 0.10725787281990051,
      "step": 21,
      "step_time": 10.022902672999862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8847335204482079,
      "epoch": 0.00022,
      "grad_norm": 4.064434324391186e-05,
      "kl": 0.00797901525220368,
      "learning_rate": 4.8e-06,
      "loss": 0.0,
      "step": 22,
      "step_time": 6.566884732000062
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9157858863472939,
      "epoch": 0.00023,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0012762682745233178,
      "kl": 0.006473740606452338,
      "learning_rate": 5.0285714285714285e-06,
      "loss": -0.0,
      "num_tokens": 573663.0,
      "reward": 1.8250000476837158,
      "reward_std": 1.0395408868789673,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.84375,
      "rewards/probe_shaping_dominance/std": 0.3689020276069641,
      "rewards/probe_terminal_raw/mean": 0.15625,
      "rewards/probe_terminal_raw/std": 0.3689020276069641,
      "rewards/rollout_reward_func/mean": -0.6875,
      "rewards/rollout_reward_func/std": 0.7378040552139282,
      "sampling/importance_sampling_ratio/max": 1.4042459726333618,
      "sampling/importance_sampling_ratio/mean": 0.8917810320854187,
      "sampling/importance_sampling_ratio/min": 0.38385266065597534,
      "sampling/sampling_logp_difference/max": 0.6875017881393433,
      "sampling/sampling_logp_difference/mean": 0.09778842329978943,
      "step": 23,
      "step_time": 10.025745644999915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9124525971710682,
      "epoch": 0.00024,
      "grad_norm": 0.0014905633870512247,
      "kl": 0.008495110145304352,
      "learning_rate": 5.257142857142857e-06,
      "loss": -0.0,
      "step": 24,
      "step_time": 6.059164398000007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8142954409122467,
      "epoch": 0.00025,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 2.238359047623817e-05,
      "kl": 0.008038407871936215,
      "learning_rate": 5.485714285714286e-06,
      "loss": 0.0,
      "num_tokens": 619642.0,
      "reward": 1.9187500476837158,
      "reward_std": 1.256836175918579,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.6296800374984741,
      "sampling/importance_sampling_ratio/mean": 0.9253418445587158,
      "sampling/importance_sampling_ratio/min": 0.5489835143089294,
      "sampling/sampling_logp_difference/max": 0.5560274124145508,
      "sampling/sampling_logp_difference/mean": 0.07420844584703445,
      "step": 25,
      "step_time": 9.907065097999862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8018932789564133,
      "epoch": 0.00026,
      "grad_norm": 2.5122495571849868e-05,
      "kl": 0.0032655789345881203,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 0.0,
      "step": 26,
      "step_time": 6.951123436999865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8414878621697426,
      "epoch": 0.00027,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.008148695342242718,
      "kl": 0.01356741931522265,
      "learning_rate": 5.942857142857143e-06,
      "loss": 0.0001,
      "num_tokens": 666818.0,
      "reward": 1.8875000476837158,
      "reward_std": 1.1053389310836792,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.8125,
      "rewards/probe_shaping_dominance/std": 0.3965577781200409,
      "rewards/probe_terminal_raw/mean": 0.1875,
      "rewards/probe_terminal_raw/std": 0.3965577781200409,
      "rewards/rollout_reward_func/mean": -0.625,
      "rewards/rollout_reward_func/std": 0.7931155562400818,
      "sampling/importance_sampling_ratio/max": 1.8690545558929443,
      "sampling/importance_sampling_ratio/mean": 0.9360212087631226,
      "sampling/importance_sampling_ratio/min": 0.3559110760688782,
      "sampling/sampling_logp_difference/max": 0.7921642065048218,
      "sampling/sampling_logp_difference/mean": 0.0954742580652237,
      "step": 27,
      "step_time": 10.1197560600001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.8348528742790222,
      "epoch": 0.00028,
      "grad_norm": 0.009077177383005619,
      "kl": 0.03206715080887079,
      "learning_rate": 6.171428571428571e-06,
      "loss": 0.0001,
      "step": 28,
      "step_time": 6.045674705999772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8990254625678062,
      "epoch": 0.00029,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 9.727512224344537e-05,
      "kl": 0.011075327638536692,
      "learning_rate": 6.4e-06,
      "loss": 0.0,
      "num_tokens": 715066.0,
      "reward": 1.5125000476837158,
      "reward_std": 0.504016101360321,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.8236454725265503,
      "sampling/importance_sampling_ratio/mean": 0.8915936946868896,
      "sampling/importance_sampling_ratio/min": 0.4869925379753113,
      "sampling/sampling_logp_difference/max": 0.7040233612060547,
      "sampling/sampling_logp_difference/mean": 0.0989106148481369,
      "step": 29,
      "step_time": 10.703582490000144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.90165825933218,
      "epoch": 0.0003,
      "grad_norm": 2.8979713533772156e-05,
      "kl": 0.0033032803366950247,
      "learning_rate": 6.628571428571428e-06,
      "loss": 0.0,
      "step": 30,
      "step_time": 6.642205023999736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.21875,
      "completions/mean_terminated_length": 2.21875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9571298733353615,
      "epoch": 0.00031,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.247951048659161e-05,
      "kl": 0.012058097088811337,
      "learning_rate": 6.857142857142856e-06,
      "loss": 0.0,
      "num_tokens": 765477.0,
      "reward": 1.1687500476837158,
      "reward_std": 0.420013427734375,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.21875,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.1994438171386719,
      "sampling/importance_sampling_ratio/mean": 0.8842767477035522,
      "sampling/importance_sampling_ratio/min": 0.625706136226654,
      "sampling/sampling_logp_difference/max": 0.3384011387825012,
      "sampling/sampling_logp_difference/mean": 0.08385618031024933,
      "step": 31,
      "step_time": 9.973356800999909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9696694537997246,
      "epoch": 0.00032,
      "grad_norm": 9.517256694380194e-05,
      "kl": 0.013399210826719354,
      "learning_rate": 7.085714285714285e-06,
      "loss": 0.0,
      "step": 32,
      "step_time": 6.042420091000054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8252013362944126,
      "epoch": 0.00033,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.007044240366667509,
      "kl": 0.1163103471044451,
      "learning_rate": 7.314285714285714e-06,
      "loss": -0.0,
      "num_tokens": 806728.0,
      "reward": 2.262500047683716,
      "reward_std": 1.1482806205749512,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.75,
      "rewards/probe_completion_length/std": 0.4399413466453552,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.71875,
      "rewards/probe_shaping_dominance/std": 0.45680341124534607,
      "rewards/probe_terminal_raw/mean": 0.28125,
      "rewards/probe_terminal_raw/std": 0.45680341124534607,
      "rewards/rollout_reward_func/mean": -0.4375,
      "rewards/rollout_reward_func/std": 0.9136068224906921,
      "sampling/importance_sampling_ratio/max": 1.6308449506759644,
      "sampling/importance_sampling_ratio/mean": 0.8596781492233276,
      "sampling/importance_sampling_ratio/min": 0.3087958097457886,
      "sampling/sampling_logp_difference/max": 1.070175290107727,
      "sampling/sampling_logp_difference/mean": 0.10737159848213196,
      "step": 33,
      "step_time": 9.889731536
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.031250000931322575,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0416666679084301,
      "entropy": 0.8173648975789547,
      "epoch": 0.00034,
      "grad_norm": 0.004044355824589729,
      "kl": 0.1296040709130466,
      "learning_rate": 7.542857142857142e-06,
      "loss": -0.0,
      "step": 34,
      "step_time": 6.904155912999954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9066675156354904,
      "epoch": 0.00035,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.820289127063006e-05,
      "kl": 0.021789020313008223,
      "learning_rate": 7.771428571428572e-06,
      "loss": 0.0,
      "num_tokens": 860935.0,
      "reward": 1.4812500476837158,
      "reward_std": 0.507007360458374,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.1967573165893555,
      "sampling/importance_sampling_ratio/mean": 0.8414925336837769,
      "sampling/importance_sampling_ratio/min": 0.4453066885471344,
      "sampling/sampling_logp_difference/max": 0.7035496234893799,
      "sampling/sampling_logp_difference/mean": 0.0921107828617096,
      "step": 35,
      "step_time": 10.496407848999866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8933523520827293,
      "epoch": 0.00036,
      "grad_norm": 0.00010574555926723406,
      "kl": 0.02637881641567219,
      "learning_rate": 8e-06,
      "loss": 0.0,
      "step": 36,
      "step_time": 6.320377744999973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5705685466527939,
      "epoch": 0.00037,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 1.521575904916972e-05,
      "kl": 0.0023586903529349,
      "learning_rate": 7.99999999962976e-06,
      "loss": 0.0,
      "num_tokens": 904680.0,
      "reward": 2.793750047683716,
      "reward_std": 1.221035361289978,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.0734606981277466,
      "sampling/importance_sampling_ratio/mean": 0.8809072971343994,
      "sampling/importance_sampling_ratio/min": 0.6448503136634827,
      "sampling/sampling_logp_difference/max": 0.47716373205184937,
      "sampling/sampling_logp_difference/mean": 0.058208562433719635,
      "step": 37,
      "step_time": 9.859377416000143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5728087574243546,
      "epoch": 0.00038,
      "grad_norm": 4.920143328490667e-05,
      "kl": 0.005856037396995362,
      "learning_rate": 7.99999999851904e-06,
      "loss": 0.0,
      "step": 38,
      "step_time": 6.990694649000034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8889300264418125,
      "epoch": 0.00039,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.715894414810464e-05,
      "kl": 0.19043646939098835,
      "learning_rate": 7.999999996667841e-06,
      "loss": 0.0,
      "num_tokens": 952527.0,
      "reward": 2.106250047683716,
      "reward_std": 1.6482517719268799,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 1.260040283203125,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.661733865737915,
      "sampling/importance_sampling_ratio/mean": 0.9227524995803833,
      "sampling/importance_sampling_ratio/min": 0.49729371070861816,
      "sampling/sampling_logp_difference/max": 0.796552300453186,
      "sampling/sampling_logp_difference/mean": 0.08508022874593735,
      "step": 39,
      "step_time": 10.035715292000077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8587775826454163,
      "epoch": 0.0004,
      "grad_norm": 7.648151949979365e-05,
      "kl": 0.19953790280851535,
      "learning_rate": 7.999999994076165e-06,
      "loss": 0.0,
      "step": 40,
      "step_time": 6.051480846000004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.840924471616745,
      "epoch": 0.00041,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00012767076259478927,
      "kl": 0.061314951490203384,
      "learning_rate": 7.999999990744006e-06,
      "loss": 0.0,
      "num_tokens": 1007061.0,
      "reward": 1.5750000476837158,
      "reward_std": 0.49186936020851135,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.7941770553588867,
      "sampling/importance_sampling_ratio/mean": 0.9026079177856445,
      "sampling/importance_sampling_ratio/min": 0.39223504066467285,
      "sampling/sampling_logp_difference/max": 0.6463124752044678,
      "sampling/sampling_logp_difference/mean": 0.10292024910449982,
      "step": 41,
      "step_time": 10.33222729900001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8315484449267387,
      "epoch": 0.00042,
      "grad_norm": 0.00022670082398690283,
      "kl": 0.085723073239933,
      "learning_rate": 7.999999986671369e-06,
      "loss": 0.0,
      "step": 42,
      "step_time": 7.211746527999935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6875193417072296,
      "epoch": 0.00043,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.010541373863816261,
      "kl": 0.5206158077344298,
      "learning_rate": 7.999999981858253e-06,
      "loss": -0.0002,
      "num_tokens": 1052251.0,
      "reward": 2.106250047683716,
      "reward_std": 1.1670026779174805,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 2.845515251159668,
      "sampling/importance_sampling_ratio/mean": 0.9331997632980347,
      "sampling/importance_sampling_ratio/min": 0.27450695633888245,
      "sampling/sampling_logp_difference/max": 1.1816264390945435,
      "sampling/sampling_logp_difference/mean": 0.09470215439796448,
      "step": 43,
      "step_time": 9.963842145999934
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0416666679084301,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.06250000186264515,
      "entropy": 0.6695949211716652,
      "epoch": 0.00044,
      "grad_norm": 0.004431854467839003,
      "kl": 1.1412691371515393,
      "learning_rate": 7.999999976304658e-06,
      "loss": -0.0002,
      "step": 44,
      "step_time": 5.966688828000315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8274488374590874,
      "epoch": 0.00045,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.006249376107007265,
      "kl": 0.19522789436450694,
      "learning_rate": 7.999999970010581e-06,
      "loss": -0.0,
      "num_tokens": 1099900.0,
      "reward": 2.012500047683716,
      "reward_std": 1.6251550912857056,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 1.263635277748108,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.78125,
      "rewards/probe_shaping_dominance/std": 0.420013427734375,
      "rewards/probe_terminal_raw/mean": 0.21875,
      "rewards/probe_terminal_raw/std": 0.420013427734375,
      "rewards/rollout_reward_func/mean": -0.5625,
      "rewards/rollout_reward_func/std": 0.84002685546875,
      "sampling/importance_sampling_ratio/max": 1.6532174348831177,
      "sampling/importance_sampling_ratio/mean": 0.85263991355896,
      "sampling/importance_sampling_ratio/min": 0.3612442910671234,
      "sampling/sampling_logp_difference/max": 0.6850485801696777,
      "sampling/sampling_logp_difference/mean": 0.09981696307659149,
      "step": 45,
      "step_time": 10.048606317000122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.8223946020007133,
      "epoch": 0.00046,
      "grad_norm": 0.005041504744440317,
      "kl": 0.9564005452157289,
      "learning_rate": 7.999999962976027e-06,
      "loss": -0.0,
      "step": 46,
      "step_time": 6.050002238000047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.1875,
      "completions/mean_terminated_length": 2.1875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9545164927840233,
      "epoch": 0.00047,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.465667007025331e-05,
      "kl": 0.010277454368406325,
      "learning_rate": 7.999999955200991e-06,
      "loss": 0.0,
      "num_tokens": 1149711.0,
      "reward": 1.1375000476837158,
      "reward_std": 0.3965577781200409,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.1875,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.72605562210083,
      "sampling/importance_sampling_ratio/mean": 0.9260122776031494,
      "sampling/importance_sampling_ratio/min": 0.6488375663757324,
      "sampling/sampling_logp_difference/max": 1.0003349781036377,
      "sampling/sampling_logp_difference/mean": 0.10305842757225037,
      "step": 47,
      "step_time": 11.073963104999962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9436408951878548,
      "epoch": 0.00048,
      "grad_norm": 3.3426829759264365e-05,
      "kl": 0.006428241502362653,
      "learning_rate": 7.999999946685478e-06,
      "loss": 0.0,
      "step": 48,
      "step_time": 6.095011092000163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8800734430551529,
      "epoch": 0.00049,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.023053472861647606,
      "kl": 2.7146486702840775,
      "learning_rate": 7.999999937429484e-06,
      "loss": -0.0001,
      "num_tokens": 1196495.0,
      "reward": 2.137500047683716,
      "reward_std": 1.6546610593795776,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 1.4013242721557617,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.8125,
      "rewards/probe_shaping_dominance/std": 0.3965577781200409,
      "rewards/probe_terminal_raw/mean": 0.1875,
      "rewards/probe_terminal_raw/std": 0.3965577781200409,
      "rewards/rollout_reward_func/mean": -0.625,
      "rewards/rollout_reward_func/std": 0.7931155562400818,
      "sampling/importance_sampling_ratio/max": 2.174635648727417,
      "sampling/importance_sampling_ratio/mean": 0.8316177129745483,
      "sampling/importance_sampling_ratio/min": 0.007070684805512428,
      "sampling/sampling_logp_difference/max": 3.8774948120117188,
      "sampling/sampling_logp_difference/mean": 0.16757240891456604,
      "step": 49,
      "step_time": 9.95241602700014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.924486055970192,
      "epoch": 0.0005,
      "grad_norm": 0.00967924203723669,
      "kl": 0.35286546498537064,
      "learning_rate": 7.999999927433012e-06,
      "loss": -0.0001,
      "step": 50,
      "step_time": 6.020178775999966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8942786604166031,
      "epoch": 0.00051,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00024296288029290736,
      "kl": 0.08609151990822284,
      "learning_rate": 7.99999991669606e-06,
      "loss": 0.0,
      "num_tokens": 1248112.0,
      "reward": 1.4812500476837158,
      "reward_std": 0.9498514533042908,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.949851393699646,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.209699273109436,
      "sampling/importance_sampling_ratio/mean": 0.8616048097610474,
      "sampling/importance_sampling_ratio/min": 0.36346468329429626,
      "sampling/sampling_logp_difference/max": 0.5839085578918457,
      "sampling/sampling_logp_difference/mean": 0.09252315759658813,
      "step": 51,
      "step_time": 10.509565642999974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8962793052196503,
      "epoch": 0.00052,
      "grad_norm": 0.0008928372408263385,
      "kl": 0.07233380628167652,
      "learning_rate": 7.999999905218627e-06,
      "loss": 0.0,
      "step": 52,
      "step_time": 6.072045420999871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6282005533576012,
      "epoch": 0.00053,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.010705840773880482,
      "kl": 1.7787348770652898,
      "learning_rate": 7.999999893000716e-06,
      "loss": -0.0001,
      "num_tokens": 1295553.0,
      "reward": 2.075000047683716,
      "reward_std": 1.008032202720642,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.75,
      "rewards/probe_completion_length/std": 0.4399413466453552,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.8125,
      "rewards/probe_shaping_dominance/std": 0.3965577781200409,
      "rewards/probe_terminal_raw/mean": 0.1875,
      "rewards/probe_terminal_raw/std": 0.3965577781200409,
      "rewards/rollout_reward_func/mean": -0.625,
      "rewards/rollout_reward_func/std": 0.7931155562400818,
      "sampling/importance_sampling_ratio/max": 1.718209981918335,
      "sampling/importance_sampling_ratio/mean": 0.8762518763542175,
      "sampling/importance_sampling_ratio/min": 0.0712529644370079,
      "sampling/sampling_logp_difference/max": 2.461639881134033,
      "sampling/sampling_logp_difference/mean": 0.13911215960979462,
      "step": 53,
      "step_time": 10.180075020999652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6223511770367622,
      "epoch": 0.00054,
      "grad_norm": 0.003766631940379739,
      "kl": 0.724787222861778,
      "learning_rate": 7.999999880042326e-06,
      "loss": -0.0001,
      "step": 54,
      "step_time": 6.618187610000177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.48880940675735474,
      "epoch": 0.00055,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05148077383637428,
      "kl": 10.946514587849379,
      "learning_rate": 7.999999866343456e-06,
      "loss": 0.0,
      "num_tokens": 1333718.0,
      "reward": 3.3562498092651367,
      "reward_std": 1.6036123037338257,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.03125,
      "rewards/probe_completion_length/std": 1.331610083580017,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.3125,
      "rewards/probe_shaping_dominance/std": 0.4709290862083435,
      "rewards/probe_terminal_raw/mean": 0.6875,
      "rewards/probe_terminal_raw/std": 0.4709290862083435,
      "rewards/rollout_reward_func/mean": 0.375,
      "rewards/rollout_reward_func/std": 0.941858172416687,
      "sampling/importance_sampling_ratio/max": 1.252049207687378,
      "sampling/importance_sampling_ratio/mean": 0.8787084221839905,
      "sampling/importance_sampling_ratio/min": 0.020720435306429863,
      "sampling/sampling_logp_difference/max": 3.2375454902648926,
      "sampling/sampling_logp_difference/mean": 0.12630514800548553,
      "step": 55,
      "step_time": 9.787316816999919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5050413832068443,
      "epoch": 0.00056,
      "grad_norm": 0.0006665181717835367,
      "kl": 1.611095119267702,
      "learning_rate": 7.999999851904105e-06,
      "loss": -0.0001,
      "step": 56,
      "step_time": 5.5423178839998855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6285109668970108,
      "epoch": 0.00057,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.005352190230041742,
      "kl": 0.8933676667511463,
      "learning_rate": 7.999999836724277e-06,
      "loss": -0.0,
      "num_tokens": 1382055.0,
      "reward": 1.9812500476837158,
      "reward_std": 0.8607714176177979,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.875,
      "rewards/probe_shaping_dominance/std": 0.33601075410842896,
      "rewards/probe_terminal_raw/mean": 0.125,
      "rewards/probe_terminal_raw/std": 0.33601075410842896,
      "rewards/rollout_reward_func/mean": -0.75,
      "rewards/rollout_reward_func/std": 0.6720215082168579,
      "sampling/importance_sampling_ratio/max": 1.6246511936187744,
      "sampling/importance_sampling_ratio/mean": 0.9118392467498779,
      "sampling/importance_sampling_ratio/min": 0.1631554365158081,
      "sampling/sampling_logp_difference/max": 1.7249137163162231,
      "sampling/sampling_logp_difference/mean": 0.108734130859375,
      "step": 57,
      "step_time": 9.972802231999822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6326157301664352,
      "epoch": 0.00058,
      "grad_norm": 0.005271768197417259,
      "kl": 0.7564798947423697,
      "learning_rate": 7.999999820803968e-06,
      "loss": -0.0,
      "step": 58,
      "step_time": 6.481488288000264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.4375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8018255531787872,
      "epoch": 0.00059,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 9.812507050810382e-05,
      "kl": 0.08816051934809366,
      "learning_rate": 7.99999980414318e-06,
      "loss": 0.0,
      "num_tokens": 1425529.0,
      "reward": 1.8875000476837158,
      "reward_std": 1.2684128284454346,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.4375,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 0.9522015452384949,
      "sampling/importance_sampling_ratio/mean": 0.8646754026412964,
      "sampling/importance_sampling_ratio/min": 0.16998648643493652,
      "sampling/sampling_logp_difference/max": 1.2199361324310303,
      "sampling/sampling_logp_difference/mean": 0.0755390077829361,
      "step": 59,
      "step_time": 9.85027468400017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8057889193296432,
      "epoch": 0.0006,
      "grad_norm": 0.00017048718291334808,
      "kl": 0.08889914313340341,
      "learning_rate": 7.999999786741913e-06,
      "loss": 0.0,
      "step": 60,
      "step_time": 6.49511566000001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.34375,
      "completions/mean_terminated_length": 2.34375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9359987080097198,
      "epoch": 0.00061,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006201486103236675,
      "kl": 0.1183123029768467,
      "learning_rate": 7.999999768600167e-06,
      "loss": 0.0,
      "num_tokens": 1475587.0,
      "reward": 1.2937500476837158,
      "reward_std": 0.4825586974620819,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.34375,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.0106334686279297,
      "sampling/importance_sampling_ratio/mean": 0.8268498182296753,
      "sampling/importance_sampling_ratio/min": 0.16725747287273407,
      "sampling/sampling_logp_difference/max": 1.6995172500610352,
      "sampling/sampling_logp_difference/mean": 0.1090221256017685,
      "step": 61,
      "step_time": 10.056730226000127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.920487180352211,
      "epoch": 0.00062,
      "grad_norm": 0.000263864261796698,
      "kl": 0.04295372625347227,
      "learning_rate": 7.99999974971794e-06,
      "loss": 0.0,
      "step": 62,
      "step_time": 6.643375879000132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6303055956959724,
      "epoch": 0.00063,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 7.813687261659652e-05,
      "kl": 0.1321493198513508,
      "learning_rate": 7.999999730095235e-06,
      "loss": 0.0,
      "num_tokens": 1523106.0,
      "reward": 2.137500047683716,
      "reward_std": 1.1482806205749512,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.489367961883545,
      "sampling/importance_sampling_ratio/mean": 0.9285314083099365,
      "sampling/importance_sampling_ratio/min": 0.5652889609336853,
      "sampling/sampling_logp_difference/max": 0.46361231803894043,
      "sampling/sampling_logp_difference/mean": 0.05925370007753372,
      "step": 63,
      "step_time": 10.579496304000031
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6239962689578533,
      "epoch": 0.00064,
      "grad_norm": 8.079901454038918e-05,
      "kl": 0.13366722106775342,
      "learning_rate": 7.99999970973205e-06,
      "loss": 0.0,
      "step": 64,
      "step_time": 6.11811525600001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.4375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7411899194121361,
      "epoch": 0.00065,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.089561844011769e-05,
      "kl": 0.022076929230934184,
      "learning_rate": 7.999999688628386e-06,
      "loss": 0.0,
      "num_tokens": 1573719.0,
      "reward": 1.3875000476837158,
      "reward_std": 0.504016101360321,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.4375,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.1824043989181519,
      "sampling/importance_sampling_ratio/mean": 0.8717299103736877,
      "sampling/importance_sampling_ratio/min": 0.5599439740180969,
      "sampling/sampling_logp_difference/max": 0.4129828214645386,
      "sampling/sampling_logp_difference/mean": 0.06982976198196411,
      "step": 65,
      "step_time": 10.05901396799959
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7376994863152504,
      "epoch": 0.00066,
      "grad_norm": 7.469410047633573e-05,
      "kl": 0.022887282819283428,
      "learning_rate": 7.999999666784243e-06,
      "loss": 0.0,
      "step": 66,
      "step_time": 7.14928144000055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.6875,
      "completions/mean_terminated_length": 2.6875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7326344400644302,
      "epoch": 0.00067,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.00794161669909954,
      "kl": 0.08390913903713226,
      "learning_rate": 7.999999644199619e-06,
      "loss": 0.0,
      "num_tokens": 1621426.0,
      "reward": 1.8875000476837158,
      "reward_std": 0.9136068224906921,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.6875,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.875,
      "rewards/probe_shaping_dominance/std": 0.33601075410842896,
      "rewards/probe_terminal_raw/mean": 0.125,
      "rewards/probe_terminal_raw/std": 0.33601075410842896,
      "rewards/rollout_reward_func/mean": -0.75,
      "rewards/rollout_reward_func/std": 0.6720215082168579,
      "sampling/importance_sampling_ratio/max": 1.5268666744232178,
      "sampling/importance_sampling_ratio/mean": 0.9305216073989868,
      "sampling/importance_sampling_ratio/min": 0.3797222077846527,
      "sampling/sampling_logp_difference/max": 0.650033712387085,
      "sampling/sampling_logp_difference/mean": 0.08735646307468414,
      "step": 67,
      "step_time": 10.012451381999881
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0416666679084301,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0416666679084301,
      "entropy": 0.7200787663459778,
      "epoch": 0.00068,
      "grad_norm": 0.0036266117822378874,
      "kl": 0.0821327978046611,
      "learning_rate": 7.999999620874517e-06,
      "loss": -0.0,
      "step": 68,
      "step_time": 5.994549507999864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7011678218841553,
      "epoch": 0.00069,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.004761384334415197,
      "kl": 0.07737505971454084,
      "learning_rate": 7.999999596808934e-06,
      "loss": -0.0,
      "num_tokens": 1670605.0,
      "reward": 1.8562500476837158,
      "reward_std": 0.9283830523490906,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.875,
      "rewards/probe_shaping_dominance/std": 0.33601075410842896,
      "rewards/probe_terminal_raw/mean": 0.125,
      "rewards/probe_terminal_raw/std": 0.33601075410842896,
      "rewards/rollout_reward_func/mean": -0.75,
      "rewards/rollout_reward_func/std": 0.6720215082168579,
      "sampling/importance_sampling_ratio/max": 1.6977269649505615,
      "sampling/importance_sampling_ratio/mean": 0.9019441604614258,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.9512550830841064,
      "sampling/sampling_logp_difference/mean": 0.09673301875591278,
      "step": 69,
      "step_time": 10.269808110999975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7132209166884422,
      "epoch": 0.0007,
      "grad_norm": 0.0037254930939525366,
      "kl": 0.04896980174817145,
      "learning_rate": 7.999999572002872e-06,
      "loss": -0.0,
      "step": 70,
      "step_time": 7.155508138000187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8009077645838261,
      "epoch": 0.00071,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 9.20472084544599e-05,
      "kl": 0.08694314991589636,
      "learning_rate": 7.999999546456332e-06,
      "loss": 0.0,
      "num_tokens": 1718484.0,
      "reward": 1.9812500476837158,
      "reward_std": 1.2309025526046753,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.4197654724121094,
      "sampling/importance_sampling_ratio/mean": 0.8803520202636719,
      "sampling/importance_sampling_ratio/min": 0.5503826141357422,
      "sampling/sampling_logp_difference/max": 0.5132150053977966,
      "sampling/sampling_logp_difference/mean": 0.07343609631061554,
      "step": 71,
      "step_time": 10.061626925999917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7975645214319229,
      "epoch": 0.00072,
      "grad_norm": 8.292407437693328e-05,
      "kl": 0.08916806877823547,
      "learning_rate": 7.999999520169313e-06,
      "loss": 0.0,
      "step": 72,
      "step_time": 6.039824374000318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6600424498319626,
      "epoch": 0.00073,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.002133256522938609,
      "kl": 0.0518730686235358,
      "learning_rate": 7.999999493141815e-06,
      "loss": -0.0,
      "num_tokens": 1769135.0,
      "reward": 1.7625000476837158,
      "reward_std": 0.5922891497612,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.75,
      "rewards/probe_completion_length/std": 0.4399413466453552,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.96875,
      "rewards/probe_shaping_dominance/std": 0.1767766922712326,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.1767766922712326,
      "rewards/rollout_reward_func/mean": -0.9375,
      "rewards/rollout_reward_func/std": 0.3535533845424652,
      "sampling/importance_sampling_ratio/max": 1.2418115139007568,
      "sampling/importance_sampling_ratio/mean": 0.8889593482017517,
      "sampling/importance_sampling_ratio/min": 0.7007108926773071,
      "sampling/sampling_logp_difference/max": 0.29283052682876587,
      "sampling/sampling_logp_difference/mean": 0.05654953420162201,
      "step": 73,
      "step_time": 10.22246037199966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6605305559933186,
      "epoch": 0.00074,
      "grad_norm": 0.0020768321119248867,
      "kl": 0.05201199743896723,
      "learning_rate": 7.999999465373833e-06,
      "loss": -0.0,
      "step": 74,
      "step_time": 7.141134875000034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.6875,
      "completions/mean_terminated_length": 2.6875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.707104254513979,
      "epoch": 0.00075,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0004946052795276046,
      "kl": 0.30471506575122476,
      "learning_rate": 7.999999436865376e-06,
      "loss": 0.0,
      "num_tokens": 1818515.0,
      "reward": 1.7000000476837158,
      "reward_std": 0.6221709847450256,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.6875,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.96875,
      "rewards/probe_shaping_dominance/std": 0.1767766922712326,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.1767766922712326,
      "rewards/rollout_reward_func/mean": -0.9375,
      "rewards/rollout_reward_func/std": 0.3535533845424652,
      "sampling/importance_sampling_ratio/max": 1.143739938735962,
      "sampling/importance_sampling_ratio/mean": 0.8724870085716248,
      "sampling/importance_sampling_ratio/min": 0.6202055811882019,
      "sampling/sampling_logp_difference/max": 0.41298508644104004,
      "sampling/sampling_logp_difference/mean": 0.06320011615753174,
      "step": 75,
      "step_time": 10.026756965999766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.7264066264033318,
      "epoch": 0.00076,
      "grad_norm": 0.0005243932246230543,
      "kl": 0.32161051593720913,
      "learning_rate": 7.999999407616439e-06,
      "loss": -0.0,
      "step": 76,
      "step_time": 6.015877203999935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.6875,
      "completions/mean_terminated_length": 2.6875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6747354827821255,
      "epoch": 0.00077,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0009918597061187029,
      "kl": 0.06751795881427824,
      "learning_rate": 7.999999377627022e-06,
      "loss": -0.0,
      "num_tokens": 1864433.0,
      "reward": 2.293750047683716,
      "reward_std": 1.2077537775039673,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.6875,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.6875,
      "rewards/probe_shaping_dominance/std": 0.4709290862083435,
      "rewards/probe_terminal_raw/mean": 0.3125,
      "rewards/probe_terminal_raw/std": 0.4709290862083435,
      "rewards/rollout_reward_func/mean": -0.375,
      "rewards/rollout_reward_func/std": 0.941858172416687,
      "sampling/importance_sampling_ratio/max": 1.3293083906173706,
      "sampling/importance_sampling_ratio/mean": 0.9025354385375977,
      "sampling/importance_sampling_ratio/min": 0.5597118139266968,
      "sampling/sampling_logp_difference/max": 0.8906712532043457,
      "sampling/sampling_logp_difference/mean": 0.0869922935962677,
      "step": 77,
      "step_time": 10.032041373000084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.672482568770647,
      "epoch": 0.00078,
      "grad_norm": 0.001063486561179161,
      "kl": 0.09529352828394622,
      "learning_rate": 7.999999346897126e-06,
      "loss": -0.0,
      "step": 78,
      "step_time": 6.591032714999983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.40625,
      "completions/mean_terminated_length": 2.40625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9129854887723923,
      "epoch": 0.00079,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0032934746704995632,
      "kl": 0.28487967513501644,
      "learning_rate": 7.99999931542675e-06,
      "loss": -0.0,
      "num_tokens": 1911461.0,
      "reward": 1.6687500476837158,
      "reward_std": 1.0846248865127563,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.40625,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.84375,
      "rewards/probe_shaping_dominance/std": 0.3689020276069641,
      "rewards/probe_terminal_raw/mean": 0.15625,
      "rewards/probe_terminal_raw/std": 0.3689020276069641,
      "rewards/rollout_reward_func/mean": -0.6875,
      "rewards/rollout_reward_func/std": 0.7378040552139282,
      "sampling/importance_sampling_ratio/max": 1.3015565872192383,
      "sampling/importance_sampling_ratio/mean": 0.878451943397522,
      "sampling/importance_sampling_ratio/min": 0.25084182620048523,
      "sampling/sampling_logp_difference/max": 0.9859634637832642,
      "sampling/sampling_logp_difference/mean": 0.09893403947353363,
      "step": 79,
      "step_time": 10.137737221999942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9050725996494293,
      "epoch": 0.0008,
      "grad_norm": 0.0033992675598710775,
      "kl": 0.29830983486317564,
      "learning_rate": 7.999999283215897e-06,
      "loss": -0.0,
      "step": 80,
      "step_time": 6.156149169999935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7616499289870262,
      "epoch": 0.00081,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0003873612731695175,
      "kl": 0.2376087919692509,
      "learning_rate": 7.999999250264562e-06,
      "loss": -0.0001,
      "num_tokens": 1959182.0,
      "reward": 1.9500000476837158,
      "reward_std": 1.0776318311691284,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.8125,
      "rewards/probe_shaping_dominance/std": 0.3965577781200409,
      "rewards/probe_terminal_raw/mean": 0.1875,
      "rewards/probe_terminal_raw/std": 0.3965577781200409,
      "rewards/rollout_reward_func/mean": -0.625,
      "rewards/rollout_reward_func/std": 0.7931155562400818,
      "sampling/importance_sampling_ratio/max": 1.5489988327026367,
      "sampling/importance_sampling_ratio/mean": 0.9243887662887573,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.2893295288085938,
      "sampling/sampling_logp_difference/mean": 0.0874212235212326,
      "step": 81,
      "step_time": 10.081119074000526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.7553209736943245,
      "epoch": 0.00082,
      "grad_norm": 0.0003225726541131735,
      "kl": 0.2503548744134605,
      "learning_rate": 7.999999216572749e-06,
      "loss": -0.0001,
      "step": 82,
      "step_time": 7.087644705999992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.6875,
      "completions/mean_terminated_length": 2.6875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5698596127331257,
      "epoch": 0.00083,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.005867909174412489,
      "kl": 0.5150133110582829,
      "learning_rate": 7.999999182140456e-06,
      "loss": -0.0,
      "num_tokens": 2004465.0,
      "reward": 2.325000047683716,
      "reward_std": 1.263635277748108,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.6875,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.65625,
      "rewards/probe_shaping_dominance/std": 0.4825586974620819,
      "rewards/probe_terminal_raw/mean": 0.34375,
      "rewards/probe_terminal_raw/std": 0.4825586974620819,
      "rewards/rollout_reward_func/mean": -0.3125,
      "rewards/rollout_reward_func/std": 0.9651173949241638,
      "sampling/importance_sampling_ratio/max": 0.9957227110862732,
      "sampling/importance_sampling_ratio/mean": 0.883823037147522,
      "sampling/importance_sampling_ratio/min": 0.38355496525764465,
      "sampling/sampling_logp_difference/max": 0.884765625,
      "sampling/sampling_logp_difference/mean": 0.05957601219415665,
      "step": 83,
      "step_time": 9.975995759000398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.5635711327195168,
      "epoch": 0.00084,
      "grad_norm": 0.0055901966989040375,
      "kl": 0.6353859202936292,
      "learning_rate": 7.999999146967684e-06,
      "loss": -0.0001,
      "step": 84,
      "step_time": 6.023201422999591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6848401352763176,
      "epoch": 0.00085,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0006412569200620055,
      "kl": 0.15268925488635432,
      "learning_rate": 7.999999111054434e-06,
      "loss": -0.0,
      "num_tokens": 2046568.0,
      "reward": 2.481250047683716,
      "reward_std": 1.3674646615982056,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5625,
      "rewards/probe_shaping_dominance/std": 0.504016101360321,
      "rewards/probe_terminal_raw/mean": 0.4375,
      "rewards/probe_terminal_raw/std": 0.504016101360321,
      "rewards/rollout_reward_func/mean": -0.125,
      "rewards/rollout_reward_func/std": 1.008032202720642,
      "sampling/importance_sampling_ratio/max": 1.0659711360931396,
      "sampling/importance_sampling_ratio/mean": 0.8968173861503601,
      "sampling/importance_sampling_ratio/min": 0.6035577058792114,
      "sampling/sampling_logp_difference/max": 0.39724525809288025,
      "sampling/sampling_logp_difference/mean": 0.06118996441364288,
      "step": 85,
      "step_time": 10.44943671900046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6844260990619659,
      "epoch": 0.00086,
      "grad_norm": 0.001066790777258575,
      "kl": 0.16099315127939917,
      "learning_rate": 7.999999074400703e-06,
      "loss": -0.0,
      "step": 86,
      "step_time": 5.960067805999643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.6875,
      "completions/mean_terminated_length": 2.6875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6104786917567253,
      "epoch": 0.00087,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.006237240973860025,
      "kl": 0.5770804272033274,
      "learning_rate": 7.999999037006494e-06,
      "loss": -0.0,
      "num_tokens": 2092772.0,
      "reward": 2.075000047683716,
      "reward_std": 1.0998533964157104,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.6875,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.78125,
      "rewards/probe_shaping_dominance/std": 0.420013427734375,
      "rewards/probe_terminal_raw/mean": 0.21875,
      "rewards/probe_terminal_raw/std": 0.420013427734375,
      "rewards/rollout_reward_func/mean": -0.5625,
      "rewards/rollout_reward_func/std": 0.84002685546875,
      "sampling/importance_sampling_ratio/max": 1.9442269802093506,
      "sampling/importance_sampling_ratio/mean": 0.9282306432723999,
      "sampling/importance_sampling_ratio/min": 0.5401378870010376,
      "sampling/sampling_logp_difference/max": 0.6658430099487305,
      "sampling/sampling_logp_difference/mean": 0.06711626052856445,
      "step": 87,
      "step_time": 10.2343749260001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.6204207167029381,
      "epoch": 0.00088,
      "grad_norm": 0.006610149517655373,
      "kl": 0.5862025530077517,
      "learning_rate": 7.999998998871805e-06,
      "loss": -0.0,
      "step": 88,
      "step_time": 6.1646545170001446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8936571925878525,
      "epoch": 0.00089,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.002412655623629689,
      "kl": 0.3142129776533693,
      "learning_rate": 7.999998959996637e-06,
      "loss": 0.0,
      "num_tokens": 2137465.0,
      "reward": 1.9500000476837158,
      "reward_std": 1.0776318311691284,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.8125,
      "rewards/probe_shaping_dominance/std": 0.3965577781200409,
      "rewards/probe_terminal_raw/mean": 0.1875,
      "rewards/probe_terminal_raw/std": 0.3965577781200409,
      "rewards/rollout_reward_func/mean": -0.625,
      "rewards/rollout_reward_func/std": 0.7931155562400818,
      "sampling/importance_sampling_ratio/max": 1.3561961650848389,
      "sampling/importance_sampling_ratio/mean": 0.889546275138855,
      "sampling/importance_sampling_ratio/min": 0.3530333340167999,
      "sampling/sampling_logp_difference/max": 0.656822681427002,
      "sampling/sampling_logp_difference/mean": 0.09530897438526154,
      "step": 89,
      "step_time": 10.901634533000333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8688658103346825,
      "epoch": 0.0009,
      "grad_norm": 0.002043940592557192,
      "kl": 0.29210204584524035,
      "learning_rate": 7.99999892038099e-06,
      "loss": 0.0,
      "step": 90,
      "step_time": 5.993553177000194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6592781916260719,
      "epoch": 0.00091,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.004781234543770552,
      "kl": 1.3648682069979259,
      "learning_rate": 7.999998880024863e-06,
      "loss": -0.0,
      "num_tokens": 2184283.0,
      "reward": 1.9187500476837158,
      "reward_std": 0.999495804309845,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.84375,
      "rewards/probe_shaping_dominance/std": 0.3689020276069641,
      "rewards/probe_terminal_raw/mean": 0.15625,
      "rewards/probe_terminal_raw/std": 0.3689020276069641,
      "rewards/rollout_reward_func/mean": -0.6875,
      "rewards/rollout_reward_func/std": 0.7378040552139282,
      "sampling/importance_sampling_ratio/max": 1.1354765892028809,
      "sampling/importance_sampling_ratio/mean": 0.8667730689048767,
      "sampling/importance_sampling_ratio/min": 0.1865314096212387,
      "sampling/sampling_logp_difference/max": 1.308552861213684,
      "sampling/sampling_logp_difference/mean": 0.0986025482416153,
      "step": 91,
      "step_time": 9.94715041100062
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.6524480134248734,
      "epoch": 0.00092,
      "grad_norm": 0.003115040250122547,
      "kl": 1.1155421955772908,
      "learning_rate": 7.999998838928257e-06,
      "loss": -0.0,
      "step": 92,
      "step_time": 6.535503675999735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7372243441641331,
      "epoch": 0.00093,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.002905595349147916,
      "kl": 0.41144446027465165,
      "learning_rate": 7.999998797091172e-06,
      "loss": -0.0,
      "num_tokens": 2232643.0,
      "reward": 1.9500000476837158,
      "reward_std": 1.0776318311691284,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.8125,
      "rewards/probe_shaping_dominance/std": 0.3965577781200409,
      "rewards/probe_terminal_raw/mean": 0.1875,
      "rewards/probe_terminal_raw/std": 0.3965577781200409,
      "rewards/rollout_reward_func/mean": -0.625,
      "rewards/rollout_reward_func/std": 0.7931155562400818,
      "sampling/importance_sampling_ratio/max": 1.311123251914978,
      "sampling/importance_sampling_ratio/mean": 0.8911527991294861,
      "sampling/importance_sampling_ratio/min": 0.5693881511688232,
      "sampling/sampling_logp_difference/max": 0.5735888481140137,
      "sampling/sampling_logp_difference/mean": 0.07083894312381744,
      "step": 93,
      "step_time": 10.464223708999953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7358968108892441,
      "epoch": 0.00094,
      "grad_norm": 0.004680361598730087,
      "kl": 0.4150165840983391,
      "learning_rate": 7.999998754513608e-06,
      "loss": -0.0,
      "step": 94,
      "step_time": 6.073112994999974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7264605090022087,
      "epoch": 0.00095,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.005375358276069164,
      "kl": 0.23870017855006154,
      "learning_rate": 7.999998711195565e-06,
      "loss": -0.0,
      "num_tokens": 2275802.0,
      "reward": 2.262500047683716,
      "reward_std": 1.3781123161315918,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.625,
      "rewards/probe_shaping_dominance/std": 0.49186936020851135,
      "rewards/probe_terminal_raw/mean": 0.375,
      "rewards/probe_terminal_raw/std": 0.49186936020851135,
      "rewards/rollout_reward_func/mean": -0.25,
      "rewards/rollout_reward_func/std": 0.9837387204170227,
      "sampling/importance_sampling_ratio/max": 1.2891932725906372,
      "sampling/importance_sampling_ratio/mean": 0.9060554504394531,
      "sampling/importance_sampling_ratio/min": 0.40840575098991394,
      "sampling/sampling_logp_difference/max": 0.8128976225852966,
      "sampling/sampling_logp_difference/mean": 0.0853162333369255,
      "step": 95,
      "step_time": 9.82769083400035
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.735051341354847,
      "epoch": 0.00096,
      "grad_norm": 0.0030660636257380247,
      "kl": 0.2500158410985023,
      "learning_rate": 7.999998667137043e-06,
      "loss": -0.0,
      "step": 96,
      "step_time": 6.971147591000317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.640817366540432,
      "epoch": 0.00097,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0021315033081918955,
      "kl": 0.6351375498416019,
      "learning_rate": 7.999998622338041e-06,
      "loss": -0.0,
      "num_tokens": 2315463.0,
      "reward": 2.6687498092651367,
      "reward_std": 1.7456743717193604,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 1.3762823343276978,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.59375,
      "rewards/probe_shaping_dominance/std": 0.49899089336395264,
      "rewards/probe_terminal_raw/mean": 0.40625,
      "rewards/probe_terminal_raw/std": 0.49899089336395264,
      "rewards/rollout_reward_func/mean": -0.1875,
      "rewards/rollout_reward_func/std": 0.9979817867279053,
      "sampling/importance_sampling_ratio/max": 1.2316614389419556,
      "sampling/importance_sampling_ratio/mean": 0.8954340219497681,
      "sampling/importance_sampling_ratio/min": 0.5406860709190369,
      "sampling/sampling_logp_difference/max": 0.47210049629211426,
      "sampling/sampling_logp_difference/mean": 0.05805978551506996,
      "step": 97,
      "step_time": 9.842610594000234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.637968648225069,
      "epoch": 0.00098,
      "grad_norm": 0.0014506971929222345,
      "kl": 0.670122139959858,
      "learning_rate": 7.999998576798562e-06,
      "loss": -0.0,
      "step": 98,
      "step_time": 5.918145998000227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6516329012811184,
      "epoch": 0.00099,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0019182172836735845,
      "kl": 0.6717661089580815,
      "learning_rate": 7.999998530518601e-06,
      "loss": -0.0001,
      "num_tokens": 2361282.0,
      "reward": 2.106250047683716,
      "reward_std": 1.322494626045227,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.6875,
      "rewards/probe_shaping_dominance/std": 0.4709290862083435,
      "rewards/probe_terminal_raw/mean": 0.3125,
      "rewards/probe_terminal_raw/std": 0.4709290862083435,
      "rewards/rollout_reward_func/mean": -0.375,
      "rewards/rollout_reward_func/std": 0.941858172416687,
      "sampling/importance_sampling_ratio/max": 1.2388545274734497,
      "sampling/importance_sampling_ratio/mean": 0.840278148651123,
      "sampling/importance_sampling_ratio/min": 0.172871395945549,
      "sampling/sampling_logp_difference/max": 1.687204122543335,
      "sampling/sampling_logp_difference/mean": 0.10819916427135468,
      "step": 99,
      "step_time": 10.378238312000121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.661732904613018,
      "epoch": 0.001,
      "grad_norm": 0.002160546835511923,
      "kl": 0.6096960648123968,
      "learning_rate": 7.999998483498162e-06,
      "loss": -0.0001,
      "step": 100,
      "step_time": 6.532955088000108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7278852015733719,
      "epoch": 0.00101,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00023665007029194385,
      "kl": 0.15003768350288738,
      "learning_rate": 7.999998435737244e-06,
      "loss": 0.0,
      "num_tokens": 2406084.0,
      "reward": 2.012500047683716,
      "reward_std": 1.2164862155914307,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.0562355518341064,
      "sampling/importance_sampling_ratio/mean": 0.8963831663131714,
      "sampling/importance_sampling_ratio/min": 0.3585115075111389,
      "sampling/sampling_logp_difference/max": 0.5996613502502441,
      "sampling/sampling_logp_difference/mean": 0.069548100233078,
      "step": 101,
      "step_time": 9.839882647999957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7269566580653191,
      "epoch": 0.00102,
      "grad_norm": 0.0002451199106872082,
      "kl": 0.16160269570536911,
      "learning_rate": 7.999998387235846e-06,
      "loss": 0.0,
      "step": 102,
      "step_time": 5.938691667999592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5686472542583942,
      "epoch": 0.00103,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 8.88900613063015e-05,
      "kl": 0.05235966534110048,
      "learning_rate": 7.99999833799397e-06,
      "loss": 0.0,
      "num_tokens": 2447237.0,
      "reward": 2.481250047683716,
      "reward_std": 1.5023503303527832,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.6354694366455078,
      "sampling/importance_sampling_ratio/mean": 0.95149165391922,
      "sampling/importance_sampling_ratio/min": 0.42863741517066956,
      "sampling/sampling_logp_difference/max": 0.6258417367935181,
      "sampling/sampling_logp_difference/mean": 0.05896550416946411,
      "step": 103,
      "step_time": 9.785490112999923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5708712860941887,
      "epoch": 0.00104,
      "grad_norm": 0.00012892860104329884,
      "kl": 0.06115663269883953,
      "learning_rate": 7.999998288011616e-06,
      "loss": 0.0,
      "step": 104,
      "step_time": 6.934970639000312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.4375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8786773160099983,
      "epoch": 0.00105,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.008230560459196568,
      "kl": 0.3639178788289428,
      "learning_rate": 7.999998237288781e-06,
      "loss": -0.0001,
      "num_tokens": 2496418.0,
      "reward": 1.5750000476837158,
      "reward_std": 0.9069623351097107,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.4375,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.90625,
      "rewards/probe_shaping_dominance/std": 0.2961445748806,
      "rewards/probe_terminal_raw/mean": 0.09375,
      "rewards/probe_terminal_raw/std": 0.2961445748806,
      "rewards/rollout_reward_func/mean": -0.8125,
      "rewards/rollout_reward_func/std": 0.5922891497612,
      "sampling/importance_sampling_ratio/max": 1.8547699451446533,
      "sampling/importance_sampling_ratio/mean": 0.8809961080551147,
      "sampling/importance_sampling_ratio/min": 0.2464524656534195,
      "sampling/sampling_logp_difference/max": 1.2753207683563232,
      "sampling/sampling_logp_difference/mean": 0.11397504061460495,
      "step": 105,
      "step_time": 10.14871879899988
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.8598420843482018,
      "epoch": 0.00106,
      "grad_norm": 0.0009213939192704856,
      "kl": 0.3936503166332841,
      "learning_rate": 7.999998185825468e-06,
      "loss": -0.0001,
      "step": 106,
      "step_time": 6.146154757999739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.71875,
      "completions/mean_terminated_length": 2.71875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6813515201210976,
      "epoch": 0.00107,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.002292017685249448,
      "kl": 0.9095189813524485,
      "learning_rate": 7.999998133621676e-06,
      "loss": -0.0,
      "num_tokens": 2544489.0,
      "reward": 2.043750047683716,
      "reward_std": 1.0273478031158447,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.71875,
      "rewards/probe_completion_length/std": 0.45680341124534607,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.8125,
      "rewards/probe_shaping_dominance/std": 0.3965577781200409,
      "rewards/probe_terminal_raw/mean": 0.1875,
      "rewards/probe_terminal_raw/std": 0.3965577781200409,
      "rewards/rollout_reward_func/mean": -0.625,
      "rewards/rollout_reward_func/std": 0.7931155562400818,
      "sampling/importance_sampling_ratio/max": 1.4959931373596191,
      "sampling/importance_sampling_ratio/mean": 0.8679620623588562,
      "sampling/importance_sampling_ratio/min": 0.15595608949661255,
      "sampling/sampling_logp_difference/max": 1.701085090637207,
      "sampling/sampling_logp_difference/mean": 0.10083653032779694,
      "step": 107,
      "step_time": 11.193133051999894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6780059337615967,
      "epoch": 0.00108,
      "grad_norm": 0.0036851251497864723,
      "kl": 0.9937728652730584,
      "learning_rate": 7.999998080677404e-06,
      "loss": -0.0,
      "step": 108,
      "step_time": 6.075748887000373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.814523309469223,
      "epoch": 0.00109,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 8.425825944868848e-05,
      "kl": 0.12947031523799524,
      "learning_rate": 7.999998026992654e-06,
      "loss": 0.0,
      "num_tokens": 2594989.0,
      "reward": 1.4187500476837158,
      "reward_std": 0.507007360458374,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.0389419794082642,
      "sampling/importance_sampling_ratio/mean": 0.8629075288772583,
      "sampling/importance_sampling_ratio/min": 0.5298696756362915,
      "sampling/sampling_logp_difference/max": 0.548180103302002,
      "sampling/sampling_logp_difference/mean": 0.07628992199897766,
      "step": 109,
      "step_time": 10.092573802000288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8251436538994312,
      "epoch": 0.0011,
      "grad_norm": 0.00012063535541528836,
      "kl": 0.13848416117252782,
      "learning_rate": 7.999997972567424e-06,
      "loss": 0.0,
      "step": 110,
      "step_time": 6.0987305060002654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.4712951257824898,
      "epoch": 0.00111,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.005738819949328899,
      "kl": 3.548261895775795,
      "learning_rate": 7.999997917401717e-06,
      "loss": -0.0001,
      "num_tokens": 2641307.0,
      "reward": 3.012500047683716,
      "reward_std": 1.014014720916748,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.0,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.46875,
      "rewards/probe_shaping_dominance/std": 0.507007360458374,
      "rewards/probe_terminal_raw/mean": 0.53125,
      "rewards/probe_terminal_raw/std": 0.507007360458374,
      "rewards/rollout_reward_func/mean": 0.0625,
      "rewards/rollout_reward_func/std": 1.014014720916748,
      "sampling/importance_sampling_ratio/max": 1.3101725578308105,
      "sampling/importance_sampling_ratio/mean": 0.8800893425941467,
      "sampling/importance_sampling_ratio/min": 0.03823943808674812,
      "sampling/sampling_logp_difference/max": 3.1835083961486816,
      "sampling/sampling_logp_difference/mean": 0.1357693374156952,
      "step": 111,
      "step_time": 10.871187624000413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.463666919618845,
      "epoch": 0.00112,
      "grad_norm": 0.007913350127637386,
      "kl": 4.181354582309723,
      "learning_rate": 7.99999786149553e-06,
      "loss": -0.0001,
      "step": 112,
      "step_time": 5.8647342179997395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6317500248551369,
      "epoch": 0.00113,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 7.839502359274775e-05,
      "kl": 0.5892274444922805,
      "learning_rate": 7.999997804848863e-06,
      "loss": 0.0,
      "num_tokens": 2688559.0,
      "reward": 2.606250047683716,
      "reward_std": 1.4052752256393433,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.4366681575775146,
      "sampling/importance_sampling_ratio/mean": 0.9554047584533691,
      "sampling/importance_sampling_ratio/min": 0.7517088055610657,
      "sampling/sampling_logp_difference/max": 0.6721935272216797,
      "sampling/sampling_logp_difference/mean": 0.06398595869541168,
      "step": 113,
      "step_time": 9.927891519000013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.632821474224329,
      "epoch": 0.00114,
      "grad_norm": 7.503158849431202e-05,
      "kl": 0.5771996614057571,
      "learning_rate": 7.999997747461717e-06,
      "loss": 0.0,
      "step": 114,
      "step_time": 6.027323633999913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8952479958534241,
      "epoch": 0.00115,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0037443714682012796,
      "kl": 0.4335987113881856,
      "learning_rate": 7.999997689334094e-06,
      "loss": -0.0001,
      "num_tokens": 2735416.0,
      "reward": 1.9187499284744263,
      "reward_std": 1.6161357164382935,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.71875,
      "rewards/probe_completion_length/std": 1.419549584388733,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.875,
      "rewards/probe_shaping_dominance/std": 0.33601075410842896,
      "rewards/probe_terminal_raw/mean": 0.125,
      "rewards/probe_terminal_raw/std": 0.33601075410842896,
      "rewards/rollout_reward_func/mean": -0.75,
      "rewards/rollout_reward_func/std": 0.6720215082168579,
      "sampling/importance_sampling_ratio/max": 1.2050693035125732,
      "sampling/importance_sampling_ratio/mean": 0.8156166672706604,
      "sampling/importance_sampling_ratio/min": 0.4705444872379303,
      "sampling/sampling_logp_difference/max": 0.7938392162322998,
      "sampling/sampling_logp_difference/mean": 0.10766139626502991,
      "step": 115,
      "step_time": 10.529442986999811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.8934030309319496,
      "epoch": 0.00116,
      "grad_norm": 0.0021822690032422543,
      "kl": 0.46727226325310767,
      "learning_rate": 7.99999763046599e-06,
      "loss": -0.0001,
      "step": 116,
      "step_time": 6.006565065000359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5.0,
      "completions/max_terminated_length": 5.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.629519946873188,
      "epoch": 0.00117,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005409108707681298,
      "kl": 0.5161917522200383,
      "learning_rate": 7.999997570857409e-06,
      "loss": 0.0,
      "num_tokens": 2780743.0,
      "reward": 2.043750047683716,
      "reward_std": 1.2790968418121338,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.6652370095252991,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.601353645324707,
      "sampling/importance_sampling_ratio/mean": 0.8943063020706177,
      "sampling/importance_sampling_ratio/min": 0.17391638457775116,
      "sampling/sampling_logp_difference/max": 1.632702112197876,
      "sampling/sampling_logp_difference/mean": 0.10117563605308533,
      "step": 117,
      "step_time": 10.079833390999966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6213446594774723,
      "epoch": 0.00118,
      "grad_norm": 0.0004654082003980875,
      "kl": 0.48489440116100013,
      "learning_rate": 7.999997510508348e-06,
      "loss": 0.0,
      "step": 118,
      "step_time": 6.549988884000413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7453503087162971,
      "epoch": 0.00119,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00014552050561178476,
      "kl": 0.22149313415866345,
      "learning_rate": 7.999997449418809e-06,
      "loss": 0.0,
      "num_tokens": 2825585.0,
      "reward": 2.325000047683716,
      "reward_std": 1.6800537109375,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 1.38540780544281,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.91416597366333,
      "sampling/importance_sampling_ratio/mean": 0.9072912335395813,
      "sampling/importance_sampling_ratio/min": 0.445117324590683,
      "sampling/sampling_logp_difference/max": 0.7076725959777832,
      "sampling/sampling_logp_difference/mean": 0.08132369816303253,
      "step": 119,
      "step_time": 10.666148662000296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7450893074274063,
      "epoch": 0.0012,
      "grad_norm": 0.00017443943943362683,
      "kl": 0.23418569331988692,
      "learning_rate": 7.99999738758879e-06,
      "loss": 0.0,
      "step": 120,
      "step_time": 6.12022116199978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.46727633848786354,
      "epoch": 0.00121,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.003154071979224682,
      "kl": 1.0851212590932846,
      "learning_rate": 7.999997325018293e-06,
      "loss": -0.0001,
      "num_tokens": 2869873.0,
      "reward": 2.637500047683716,
      "reward_std": 1.2296733856201172,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5625,
      "rewards/probe_shaping_dominance/std": 0.504016101360321,
      "rewards/probe_terminal_raw/mean": 0.4375,
      "rewards/probe_terminal_raw/std": 0.504016101360321,
      "rewards/rollout_reward_func/mean": -0.125,
      "rewards/rollout_reward_func/std": 1.008032202720642,
      "sampling/importance_sampling_ratio/max": 1.017763614654541,
      "sampling/importance_sampling_ratio/mean": 0.8756238222122192,
      "sampling/importance_sampling_ratio/min": 0.3428362011909485,
      "sampling/sampling_logp_difference/max": 1.0764925479888916,
      "sampling/sampling_logp_difference/mean": 0.06550387293100357,
      "step": 121,
      "step_time": 9.955963402999714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.45788635686039925,
      "epoch": 0.00122,
      "grad_norm": 0.0027381964027881622,
      "kl": 1.146961946040392,
      "learning_rate": 7.999997261707317e-06,
      "loss": -0.0001,
      "step": 122,
      "step_time": 6.473173019000114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.40625,
      "completions/mean_terminated_length": 2.40625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0169628709554672,
      "epoch": 0.00123,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00026922833058051765,
      "kl": 0.24311537444009446,
      "learning_rate": 7.999997197655861e-06,
      "loss": 0.0,
      "num_tokens": 2922019.0,
      "reward": 1.3562500476837158,
      "reward_std": 0.498990923166275,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.40625,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.919534683227539,
      "sampling/importance_sampling_ratio/mean": 0.8465455174446106,
      "sampling/importance_sampling_ratio/min": 0.2150496542453766,
      "sampling/sampling_logp_difference/max": 1.3104627132415771,
      "sampling/sampling_logp_difference/mean": 0.16094987094402313,
      "step": 123,
      "step_time": 10.31892864700012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0269659757614136,
      "epoch": 0.00124,
      "grad_norm": 0.0002643872576300055,
      "kl": 0.2424255390651524,
      "learning_rate": 7.999997132863928e-06,
      "loss": 0.0,
      "step": 124,
      "step_time": 6.228070854999714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.567756149917841,
      "epoch": 0.00125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 1.2914318176626693e-05,
      "kl": 0.0012493410226852575,
      "learning_rate": 7.999997067331516e-06,
      "loss": 0.0,
      "num_tokens": 2961809.0,
      "reward": 3.200000047683716,
      "reward_std": 1.319823980331421,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.75,
      "rewards/probe_completion_length/std": 0.4399413466453552,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.25,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.75,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": 0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 0.9578561782836914,
      "sampling/importance_sampling_ratio/mean": 0.8919152021408081,
      "sampling/importance_sampling_ratio/min": 0.5368828177452087,
      "sampling/sampling_logp_difference/max": 0.41883519291877747,
      "sampling/sampling_logp_difference/mean": 0.04870132356882095,
      "step": 125,
      "step_time": 9.758941899000092
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5752867646515369,
      "epoch": 0.00126,
      "grad_norm": 9.876180229184683e-06,
      "kl": 0.0008715322792340885,
      "learning_rate": 7.999997001058625e-06,
      "loss": 0.0,
      "step": 126,
      "step_time": 6.379213361999746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5773117579519749,
      "epoch": 0.00127,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0019505616510286927,
      "kl": 1.3100847490131855,
      "learning_rate": 7.999996934045254e-06,
      "loss": -0.0,
      "num_tokens": 3007241.0,
      "reward": 2.481250047683716,
      "reward_std": 1.3674646615982056,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5625,
      "rewards/probe_shaping_dominance/std": 0.504016101360321,
      "rewards/probe_terminal_raw/mean": 0.4375,
      "rewards/probe_terminal_raw/std": 0.504016101360321,
      "rewards/rollout_reward_func/mean": -0.125,
      "rewards/rollout_reward_func/std": 1.008032202720642,
      "sampling/importance_sampling_ratio/max": 1.2050573825836182,
      "sampling/importance_sampling_ratio/mean": 0.9187490940093994,
      "sampling/importance_sampling_ratio/min": 0.21540533006191254,
      "sampling/sampling_logp_difference/max": 1.124786615371704,
      "sampling/sampling_logp_difference/mean": 0.08163493126630783,
      "step": 127,
      "step_time": 10.438749245999361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5682215988636017,
      "epoch": 0.00128,
      "grad_norm": 0.0021813081111758947,
      "kl": 1.3469352908432484,
      "learning_rate": 7.999996866291406e-06,
      "loss": -0.0,
      "step": 128,
      "step_time": 5.971626152999988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.643170677125454,
      "epoch": 0.00129,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00019623831030912697,
      "kl": 0.6019693883135915,
      "learning_rate": 7.999996797797079e-06,
      "loss": 0.0,
      "num_tokens": 3058344.0,
      "reward": 1.5750000476837158,
      "reward_std": 0.49186936020851135,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 2.0693564414978027,
      "sampling/importance_sampling_ratio/mean": 0.9278706908226013,
      "sampling/importance_sampling_ratio/min": 0.32946839928627014,
      "sampling/sampling_logp_difference/max": 1.113986611366272,
      "sampling/sampling_logp_difference/mean": 0.08836956322193146,
      "step": 129,
      "step_time": 10.049561713999992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6441235989332199,
      "epoch": 0.0013,
      "grad_norm": 0.00011418814392527565,
      "kl": 0.5788391289788706,
      "learning_rate": 7.999996728562273e-06,
      "loss": 0.0,
      "step": 130,
      "step_time": 6.564315318000126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6178856827318668,
      "epoch": 0.00131,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.785019650124013e-05,
      "kl": 0.7463820744305849,
      "learning_rate": 7.999996658586989e-06,
      "loss": 0.0,
      "num_tokens": 3104192.0,
      "reward": 2.043750047683716,
      "reward_std": 1.201058030128479,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.4856263399124146,
      "sampling/importance_sampling_ratio/mean": 0.9183324575424194,
      "sampling/importance_sampling_ratio/min": 0.5824882984161377,
      "sampling/sampling_logp_difference/max": 0.4804893732070923,
      "sampling/sampling_logp_difference/mean": 0.06271836906671524,
      "step": 131,
      "step_time": 10.31453646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6255888566374779,
      "epoch": 0.00132,
      "grad_norm": 7.66810990171507e-05,
      "kl": 0.7543911002576351,
      "learning_rate": 7.999996587871225e-06,
      "loss": 0.0,
      "step": 132,
      "step_time": 5.922291539999378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.29608798772096634,
      "epoch": 0.00133,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.023734381422400475,
      "kl": 7.532367415726185,
      "learning_rate": 7.999996516414982e-06,
      "loss": 0.0,
      "num_tokens": 3145988.0,
      "reward": 3.762500047683716,
      "reward_std": 0.5922891497612,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.0,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.09375,
      "rewards/probe_shaping_dominance/std": 0.2961445748806,
      "rewards/probe_terminal_raw/mean": 0.90625,
      "rewards/probe_terminal_raw/std": 0.2961445748806,
      "rewards/rollout_reward_func/mean": 0.8125,
      "rewards/rollout_reward_func/std": 0.5922891497612,
      "sampling/importance_sampling_ratio/max": 1.0541738271713257,
      "sampling/importance_sampling_ratio/mean": 0.9211882948875427,
      "sampling/importance_sampling_ratio/min": 0.12494106590747833,
      "sampling/sampling_logp_difference/max": 1.975663185119629,
      "sampling/sampling_logp_difference/mean": 0.044852159917354584,
      "step": 133,
      "step_time": 10.01835433600013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.296497218310833,
      "epoch": 0.00134,
      "grad_norm": 0.00268841371871531,
      "kl": 2.1114935651421547,
      "learning_rate": 7.999996444218262e-06,
      "loss": -0.0,
      "step": 134,
      "step_time": 5.628559854000287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.3125,
      "completions/mean_terminated_length": 2.3125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.1430947929620743,
      "epoch": 0.00135,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00015971790708135813,
      "kl": 0.15770327649079263,
      "learning_rate": 7.999996371281063e-06,
      "loss": 0.0,
      "num_tokens": 3197833.0,
      "reward": 1.2625000476837158,
      "reward_std": 0.4709290862083435,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.3125,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.2785383462905884,
      "sampling/importance_sampling_ratio/mean": 0.8237912654876709,
      "sampling/importance_sampling_ratio/min": 0.3784085810184479,
      "sampling/sampling_logp_difference/max": 0.7751260995864868,
      "sampling/sampling_logp_difference/mean": 0.12488892674446106,
      "step": 135,
      "step_time": 10.539870797000276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1564906761050224,
      "epoch": 0.00136,
      "grad_norm": 0.00014199405268300325,
      "kl": 0.14654893381521106,
      "learning_rate": 7.999996297603385e-06,
      "loss": 0.0,
      "step": 136,
      "step_time": 6.1141825849999805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6290507353842258,
      "epoch": 0.00137,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 8.231111860368401e-05,
      "kl": 0.574861828237772,
      "learning_rate": 7.999996223185228e-06,
      "loss": 0.0,
      "num_tokens": 3245445.0,
      "reward": 2.606250047683716,
      "reward_std": 1.4052752256393433,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.1159268617630005,
      "sampling/importance_sampling_ratio/mean": 0.8925962448120117,
      "sampling/importance_sampling_ratio/min": 0.41961097717285156,
      "sampling/sampling_logp_difference/max": 0.7973945140838623,
      "sampling/sampling_logp_difference/mean": 0.07273562997579575,
      "step": 137,
      "step_time": 10.542406992999759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.635385774075985,
      "epoch": 0.00138,
      "grad_norm": 6.592683348571882e-05,
      "kl": 0.5743689931114204,
      "learning_rate": 7.999996148026594e-06,
      "loss": 0.0,
      "step": 138,
      "step_time": 6.578659082000513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7651711776852608,
      "epoch": 0.00139,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0001584794808877632,
      "kl": 0.5461679399013519,
      "learning_rate": 7.99999607212748e-06,
      "loss": 0.0,
      "num_tokens": 3295589.0,
      "reward": 1.5750000476837158,
      "reward_std": 0.49186936020851135,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.483282208442688,
      "sampling/importance_sampling_ratio/mean": 0.9177201986312866,
      "sampling/importance_sampling_ratio/min": 0.32641756534576416,
      "sampling/sampling_logp_difference/max": 0.6357085704803467,
      "sampling/sampling_logp_difference/mean": 0.08419822156429291,
      "step": 139,
      "step_time": 10.432583765000118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8008819669485092,
      "epoch": 0.0014,
      "grad_norm": 0.00021447367907967418,
      "kl": 0.5468537542037666,
      "learning_rate": 7.999995995487888e-06,
      "loss": 0.0,
      "step": 140,
      "step_time": 6.285205011999778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.71875,
      "completions/mean_terminated_length": 2.71875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6883749663829803,
      "epoch": 0.00141,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0016245342558249831,
      "kl": 2.113200404914096,
      "learning_rate": 7.999995918107818e-06,
      "loss": -0.0,
      "num_tokens": 3342865.0,
      "reward": 2.606250047683716,
      "reward_std": 1.334634780883789,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.71875,
      "rewards/probe_completion_length/std": 0.45680341124534607,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.53125,
      "rewards/probe_shaping_dominance/std": 0.507007360458374,
      "rewards/probe_terminal_raw/mean": 0.46875,
      "rewards/probe_terminal_raw/std": 0.507007360458374,
      "rewards/rollout_reward_func/mean": -0.0625,
      "rewards/rollout_reward_func/std": 1.014014720916748,
      "sampling/importance_sampling_ratio/max": 1.1353306770324707,
      "sampling/importance_sampling_ratio/mean": 0.8638892769813538,
      "sampling/importance_sampling_ratio/min": 0.27192848920822144,
      "sampling/sampling_logp_difference/max": 1.2334420680999756,
      "sampling/sampling_logp_difference/mean": 0.08482334017753601,
      "step": 141,
      "step_time": 10.48996880799973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6942117437720299,
      "epoch": 0.00142,
      "grad_norm": 0.0024598841555416584,
      "kl": 2.2772381571121514,
      "learning_rate": 7.999995839987269e-06,
      "loss": -0.0,
      "step": 142,
      "step_time": 6.0210988920000545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.09375,
      "completions/mean_terminated_length": 2.09375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2067401632666588,
      "epoch": 0.00143,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00035976560320705175,
      "kl": 0.1048080543987453,
      "learning_rate": 7.999995761126243e-06,
      "loss": 0.0,
      "num_tokens": 3391495.0,
      "reward": 1.0437500476837158,
      "reward_std": 0.29614460468292236,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.09375,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.1019096374511719,
      "sampling/importance_sampling_ratio/mean": 0.861067533493042,
      "sampling/importance_sampling_ratio/min": 0.5161373019218445,
      "sampling/sampling_logp_difference/max": 0.41501492261886597,
      "sampling/sampling_logp_difference/mean": 0.09741053730249405,
      "step": 143,
      "step_time": 10.409878404999972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.2020705938339233,
      "epoch": 0.00144,
      "grad_norm": 0.00014758903125766665,
      "kl": 0.09721950892708264,
      "learning_rate": 7.999995681524736e-06,
      "loss": 0.0,
      "step": 144,
      "step_time": 6.574293671999612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8218728005886078,
      "epoch": 0.00145,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00010783881589304656,
      "kl": 0.5577464867383242,
      "learning_rate": 7.999995601182752e-06,
      "loss": 0.0,
      "num_tokens": 3438637.0,
      "reward": 2.262500047683716,
      "reward_std": 1.0606601238250732,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.0564521551132202,
      "sampling/importance_sampling_ratio/mean": 0.79786217212677,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.278409481048584,
      "sampling/sampling_logp_difference/mean": 0.10636921972036362,
      "step": 145,
      "step_time": 10.219195225000021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8261116743087769,
      "epoch": 0.00146,
      "grad_norm": 9.589105320628732e-05,
      "kl": 0.5436595194041729,
      "learning_rate": 7.999995520100289e-06,
      "loss": 0.0,
      "step": 146,
      "step_time": 6.632304521999458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.662704985588789,
      "epoch": 0.00147,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 2.6694644475355744e-05,
      "kl": 0.5263466238975525,
      "learning_rate": 7.999995438277348e-06,
      "loss": 0.0,
      "num_tokens": 3482380.0,
      "reward": 2.606250047683716,
      "reward_std": 1.4052752256393433,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.115870475769043,
      "sampling/importance_sampling_ratio/mean": 0.9159581661224365,
      "sampling/importance_sampling_ratio/min": 0.6554316282272339,
      "sampling/sampling_logp_difference/max": 0.27883535623550415,
      "sampling/sampling_logp_difference/mean": 0.047572724521160126,
      "step": 147,
      "step_time": 9.820527543000026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6697583869099617,
      "epoch": 0.00148,
      "grad_norm": 2.685715298866853e-05,
      "kl": 0.5248434137902223,
      "learning_rate": 7.99999535571393e-06,
      "loss": 0.0,
      "step": 148,
      "step_time": 5.959111133999841
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.3125,
      "completions/mean_terminated_length": 2.3125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8875343278050423,
      "epoch": 0.00149,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.001196768716909e-05,
      "kl": 0.0462939627468586,
      "learning_rate": 7.999995272410032e-06,
      "loss": 0.0,
      "num_tokens": 3528001.0,
      "reward": 1.7625000476837158,
      "reward_std": 1.3060035705566406,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.3125,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.5503108501434326,
      "sampling/importance_sampling_ratio/mean": 0.8569838404655457,
      "sampling/importance_sampling_ratio/min": 0.38676348328590393,
      "sampling/sampling_logp_difference/max": 0.8490326404571533,
      "sampling/sampling_logp_difference/mean": 0.10086904466152191,
      "step": 149,
      "step_time": 10.409084240000084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.885762520134449,
      "epoch": 0.0015,
      "grad_norm": 4.54061409982387e-05,
      "kl": 0.0408337813714752,
      "learning_rate": 7.999995188365656e-06,
      "loss": 0.0,
      "step": 150,
      "step_time": 6.53108840799996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.40625,
      "completions/mean_terminated_length": 2.40625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9287142679095268,
      "epoch": 0.00151,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 2.5531895516905934e-05,
      "kl": 0.08757305558538064,
      "learning_rate": 7.999995103580802e-06,
      "loss": 0.0,
      "num_tokens": 3577744.0,
      "reward": 1.3562500476837158,
      "reward_std": 0.498990923166275,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.40625,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.1747065782546997,
      "sampling/importance_sampling_ratio/mean": 0.8980759382247925,
      "sampling/importance_sampling_ratio/min": 0.6533486247062683,
      "sampling/sampling_logp_difference/max": 0.3130163550376892,
      "sampling/sampling_logp_difference/mean": 0.07302668690681458,
      "step": 151,
      "step_time": 10.05969518300003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9390483722090721,
      "epoch": 0.00152,
      "grad_norm": 2.0379042325657792e-05,
      "kl": 0.08184675365919247,
      "learning_rate": 7.99999501805547e-06,
      "loss": 0.0,
      "step": 152,
      "step_time": 6.642018062000034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.4375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9555944502353668,
      "epoch": 0.00153,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.609483559965156e-05,
      "kl": 0.04718830151250586,
      "learning_rate": 7.99999493178966e-06,
      "loss": 0.0,
      "num_tokens": 3622932.0,
      "reward": 1.3875000476837158,
      "reward_std": 0.504016101360321,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.4375,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.049941062927246,
      "sampling/importance_sampling_ratio/mean": 0.8323773145675659,
      "sampling/importance_sampling_ratio/min": 0.476598858833313,
      "sampling/sampling_logp_difference/max": 0.45580363273620605,
      "sampling/sampling_logp_difference/mean": 0.10586726665496826,
      "step": 153,
      "step_time": 10.33890006899992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9704329445958138,
      "epoch": 0.00154,
      "grad_norm": 4.737279596156441e-05,
      "kl": 0.04786970978602767,
      "learning_rate": 7.99999484478337e-06,
      "loss": 0.0,
      "step": 154,
      "step_time": 5.966872542999681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7249070331454277,
      "epoch": 0.00155,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.00405209232121706,
      "kl": 0.4083508696348872,
      "learning_rate": 7.999994757036603e-06,
      "loss": -0.0001,
      "num_tokens": 3671748.0,
      "reward": 1.9500000476837158,
      "reward_std": 1.1639753580093384,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.78125,
      "rewards/probe_shaping_dominance/std": 0.420013427734375,
      "rewards/probe_terminal_raw/mean": 0.21875,
      "rewards/probe_terminal_raw/std": 0.420013427734375,
      "rewards/rollout_reward_func/mean": -0.5625,
      "rewards/rollout_reward_func/std": 0.84002685546875,
      "sampling/importance_sampling_ratio/max": 1.0744291543960571,
      "sampling/importance_sampling_ratio/mean": 0.8961950540542603,
      "sampling/importance_sampling_ratio/min": 0.4760172367095947,
      "sampling/sampling_logp_difference/max": 0.604301929473877,
      "sampling/sampling_logp_difference/mean": 0.06410501897335052,
      "step": 155,
      "step_time": 10.121234842999684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.7260234132409096,
      "epoch": 0.00156,
      "grad_norm": 0.0011063937563449144,
      "kl": 0.4271300343389157,
      "learning_rate": 7.999994668549356e-06,
      "loss": -0.0001,
      "step": 156,
      "step_time": 6.629027390999681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0310870260000229,
      "epoch": 0.00157,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0030120075680315495,
      "kl": 0.20033146999776363,
      "learning_rate": 7.999994579321633e-06,
      "loss": -0.0,
      "num_tokens": 3723638.0,
      "reward": 1.4812500476837158,
      "reward_std": 0.671271026134491,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.96875,
      "rewards/probe_shaping_dominance/std": 0.1767766922712326,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.1767766922712326,
      "rewards/rollout_reward_func/mean": -0.9375,
      "rewards/rollout_reward_func/std": 0.3535533845424652,
      "sampling/importance_sampling_ratio/max": 1.8720694780349731,
      "sampling/importance_sampling_ratio/mean": 0.896269679069519,
      "sampling/importance_sampling_ratio/min": 0.544737696647644,
      "sampling/sampling_logp_difference/max": 0.7200915813446045,
      "sampling/sampling_logp_difference/mean": 0.10810667276382446,
      "step": 157,
      "step_time": 10.497650911999926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 1.0275204181671143,
      "epoch": 0.00158,
      "grad_norm": 0.002701184945181012,
      "kl": 0.20881290454417467,
      "learning_rate": 7.999994489353432e-06,
      "loss": -0.0,
      "step": 158,
      "step_time": 6.045203006000065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7524381056427956,
      "epoch": 0.00159,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.496976180234924e-05,
      "kl": 0.02527153951814398,
      "learning_rate": 7.999994398644752e-06,
      "loss": 0.0,
      "num_tokens": 3766015.0,
      "reward": 2.2624998092651367,
      "reward_std": 1.7121481895446777,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 1.4013242721557617,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.0045019388198853,
      "sampling/importance_sampling_ratio/mean": 0.8471688032150269,
      "sampling/importance_sampling_ratio/min": 0.5022687315940857,
      "sampling/sampling_logp_difference/max": 0.5595995187759399,
      "sampling/sampling_logp_difference/mean": 0.06900060176849365,
      "step": 159,
      "step_time": 9.84526067000047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.744699664413929,
      "epoch": 0.0016,
      "grad_norm": 4.002218338428065e-05,
      "kl": 0.0263721200872169,
      "learning_rate": 7.999994307195594e-06,
      "loss": 0.0,
      "step": 160,
      "step_time": 6.48367115699989
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.45632531866431236,
      "epoch": 0.00161,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 2.4971124730654992e-05,
      "kl": 0.5871615024880157,
      "learning_rate": 7.99999421500596e-06,
      "loss": 0.0,
      "num_tokens": 3809262.0,
      "reward": 3.356250047683716,
      "reward_std": 1.0734140872955322,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.25,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.75,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": 0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 0.9640352725982666,
      "sampling/importance_sampling_ratio/mean": 0.9091618061065674,
      "sampling/importance_sampling_ratio/min": 0.5254960060119629,
      "sampling/sampling_logp_difference/max": 0.30299806594848633,
      "sampling/sampling_logp_difference/mean": 0.037094734609127045,
      "step": 161,
      "step_time": 10.210792510000601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.45290883257985115,
      "epoch": 0.00162,
      "grad_norm": 2.5933706638170406e-05,
      "kl": 0.5865964656695724,
      "learning_rate": 7.999994122075845e-06,
      "loss": 0.0,
      "step": 162,
      "step_time": 5.857546231000242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.25,
      "completions/mean_terminated_length": 2.25,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.152751550078392,
      "epoch": 0.00163,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.958712037885562e-05,
      "kl": 0.07499606953933835,
      "learning_rate": 7.999994028405253e-06,
      "loss": 0.0,
      "num_tokens": 3856949.0,
      "reward": 1.2000000476837158,
      "reward_std": 0.4399413466453552,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.25,
      "rewards/probe_completion_length/std": 0.4399413466453552,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.1793038845062256,
      "sampling/importance_sampling_ratio/mean": 0.8708353042602539,
      "sampling/importance_sampling_ratio/min": 0.4095854163169861,
      "sampling/sampling_logp_difference/max": 0.46399402618408203,
      "sampling/sampling_logp_difference/mean": 0.10172684490680695,
      "step": 163,
      "step_time": 10.000568296000438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1484008505940437,
      "epoch": 0.00164,
      "grad_norm": 9.33783157961443e-05,
      "kl": 0.0766537832969334,
      "learning_rate": 7.999993933994183e-06,
      "loss": 0.0,
      "step": 164,
      "step_time": 6.573066442999789
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8715492784976959,
      "epoch": 0.00165,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.001674352795816958,
      "kl": 0.38867083785589784,
      "learning_rate": 7.999993838842636e-06,
      "loss": -0.0,
      "num_tokens": 3906527.0,
      "reward": 1.7937500476837158,
      "reward_std": 0.8466013669967651,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.90625,
      "rewards/probe_shaping_dominance/std": 0.2961445748806,
      "rewards/probe_terminal_raw/mean": 0.09375,
      "rewards/probe_terminal_raw/std": 0.2961445748806,
      "rewards/rollout_reward_func/mean": -0.8125,
      "rewards/rollout_reward_func/std": 0.5922891497612,
      "sampling/importance_sampling_ratio/max": 1.153779149055481,
      "sampling/importance_sampling_ratio/mean": 0.8230181336402893,
      "sampling/importance_sampling_ratio/min": 0.3351452946662903,
      "sampling/sampling_logp_difference/max": 0.924765944480896,
      "sampling/sampling_logp_difference/mean": 0.10521954298019409,
      "step": 165,
      "step_time": 10.572739006000347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.8823190033435822,
      "epoch": 0.00166,
      "grad_norm": 0.0011349389096722007,
      "kl": 0.3862094555515796,
      "learning_rate": 7.99999374295061e-06,
      "loss": -0.0,
      "step": 166,
      "step_time": 6.040446489000033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9422005489468575,
      "epoch": 0.00167,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0008965503075160086,
      "kl": 0.3014809291344136,
      "learning_rate": 7.999993646318106e-06,
      "loss": -0.0,
      "num_tokens": 3954615.0,
      "reward": 1.7625000476837158,
      "reward_std": 0.8590129613876343,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.90625,
      "rewards/probe_shaping_dominance/std": 0.2961445748806,
      "rewards/probe_terminal_raw/mean": 0.09375,
      "rewards/probe_terminal_raw/std": 0.2961445748806,
      "rewards/rollout_reward_func/mean": -0.8125,
      "rewards/rollout_reward_func/std": 0.5922891497612,
      "sampling/importance_sampling_ratio/max": 1.2705289125442505,
      "sampling/importance_sampling_ratio/mean": 0.8543938994407654,
      "sampling/importance_sampling_ratio/min": 0.325277715921402,
      "sampling/sampling_logp_difference/max": 0.7510074377059937,
      "sampling/sampling_logp_difference/mean": 0.1064271330833435,
      "step": 167,
      "step_time": 9.966102381999917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9336102455854416,
      "epoch": 0.00168,
      "grad_norm": 0.0007681242423132062,
      "kl": 0.29743986390531063,
      "learning_rate": 7.999993548945123e-06,
      "loss": -0.0,
      "step": 168,
      "step_time": 6.981794994000211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8915975391864777,
      "epoch": 0.00169,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0011564207961782813,
      "kl": 0.25107664382085204,
      "learning_rate": 7.999993450831664e-06,
      "loss": -0.0,
      "num_tokens": 4001506.0,
      "reward": 1.8250000476837158,
      "reward_std": 1.31369948387146,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.9482581615447998,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.84375,
      "rewards/probe_shaping_dominance/std": 0.3689020276069641,
      "rewards/probe_terminal_raw/mean": 0.15625,
      "rewards/probe_terminal_raw/std": 0.3689020276069641,
      "rewards/rollout_reward_func/mean": -0.6875,
      "rewards/rollout_reward_func/std": 0.7378040552139282,
      "sampling/importance_sampling_ratio/max": 1.938115119934082,
      "sampling/importance_sampling_ratio/mean": 0.9251878261566162,
      "sampling/importance_sampling_ratio/min": 0.5063843727111816,
      "sampling/sampling_logp_difference/max": 0.7367093563079834,
      "sampling/sampling_logp_difference/mean": 0.08683702349662781,
      "step": 169,
      "step_time": 9.905912688000399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8912236839532852,
      "epoch": 0.0017,
      "grad_norm": 0.0012997838202863932,
      "kl": 0.2457849751226604,
      "learning_rate": 7.999993351977727e-06,
      "loss": -0.0,
      "step": 170,
      "step_time": 6.000670364999905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.4375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.836796797811985,
      "epoch": 0.00171,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 8.854575571604073e-05,
      "kl": 0.06441712872037897,
      "learning_rate": 7.999993252383311e-06,
      "loss": 0.0,
      "num_tokens": 4049332.0,
      "reward": 1.8875000476837158,
      "reward_std": 1.2684128284454346,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.4375,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.1459448337554932,
      "sampling/importance_sampling_ratio/mean": 0.8837835788726807,
      "sampling/importance_sampling_ratio/min": 0.5503401160240173,
      "sampling/sampling_logp_difference/max": 0.2968907952308655,
      "sampling/sampling_logp_difference/mean": 0.06938318908214569,
      "step": 171,
      "step_time": 10.424372221999874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8422264903783798,
      "epoch": 0.00172,
      "grad_norm": 6.102959014242515e-05,
      "kl": 0.05693867025547661,
      "learning_rate": 7.999993152048418e-06,
      "loss": 0.0,
      "step": 172,
      "step_time": 6.742195677000609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5464613921940327,
      "epoch": 0.00173,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00066503876587376,
      "kl": 0.9290937134064734,
      "learning_rate": 7.999993050973047e-06,
      "loss": 0.0,
      "num_tokens": 4094304.0,
      "reward": 2.262500047683716,
      "reward_std": 1.0606601238250732,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.245759129524231,
      "sampling/importance_sampling_ratio/mean": 0.8740517497062683,
      "sampling/importance_sampling_ratio/min": 0.12748770415782928,
      "sampling/sampling_logp_difference/max": 1.9313684701919556,
      "sampling/sampling_logp_difference/mean": 0.08258374035358429,
      "step": 173,
      "step_time": 10.055813986999965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5519991889595985,
      "epoch": 0.00174,
      "grad_norm": 0.0006888221250846982,
      "kl": 0.9350114449625835,
      "learning_rate": 7.999992949157197e-06,
      "loss": 0.0,
      "step": 174,
      "step_time": 5.969105376000243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.34375,
      "completions/mean_terminated_length": 2.34375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9456525817513466,
      "epoch": 0.00175,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0010870946571230888,
      "kl": 0.579621426644735,
      "learning_rate": 7.999992846600872e-06,
      "loss": -0.0,
      "num_tokens": 4143282.0,
      "reward": 1.7312500476837158,
      "reward_std": 1.2374368906021118,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.34375,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.78125,
      "rewards/probe_shaping_dominance/std": 0.420013427734375,
      "rewards/probe_terminal_raw/mean": 0.21875,
      "rewards/probe_terminal_raw/std": 0.420013427734375,
      "rewards/rollout_reward_func/mean": -0.5625,
      "rewards/rollout_reward_func/std": 0.84002685546875,
      "sampling/importance_sampling_ratio/max": 1.0026894807815552,
      "sampling/importance_sampling_ratio/mean": 0.8730309009552002,
      "sampling/importance_sampling_ratio/min": 0.5163137316703796,
      "sampling/sampling_logp_difference/max": 0.3348468840122223,
      "sampling/sampling_logp_difference/mean": 0.07812627404928207,
      "step": 175,
      "step_time": 10.401969632999453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9536789208650589,
      "epoch": 0.00176,
      "grad_norm": 0.001607311423867941,
      "kl": 0.5831765849143267,
      "learning_rate": 7.999992743304069e-06,
      "loss": -0.0,
      "step": 176,
      "step_time": 6.50468187199931
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.40625,
      "completions/mean_terminated_length": 2.40625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8540512844920158,
      "epoch": 0.00177,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.113742736284621e-05,
      "kl": 0.5639435993507504,
      "learning_rate": 7.999992639266786e-06,
      "loss": 0.0,
      "num_tokens": 4188525.0,
      "reward": 1.8562500476837158,
      "reward_std": 1.2790968418121338,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.40625,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.5230607986450195,
      "sampling/importance_sampling_ratio/mean": 0.9213600754737854,
      "sampling/importance_sampling_ratio/min": 0.5386036038398743,
      "sampling/sampling_logp_difference/max": 0.5391008853912354,
      "sampling/sampling_logp_difference/mean": 0.07177218794822693,
      "step": 177,
      "step_time": 9.79140261700013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8499638438224792,
      "epoch": 0.00178,
      "grad_norm": 5.0212442147312686e-05,
      "kl": 0.5627318718470633,
      "learning_rate": 7.999992534489026e-06,
      "loss": 0.0,
      "step": 178,
      "step_time": 5.927821066999968
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.15625,
      "completions/mean_terminated_length": 2.15625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.19364695250988,
      "epoch": 0.00179,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000155127709149383,
      "kl": 0.05882764467969537,
      "learning_rate": 7.99999242897079e-06,
      "loss": 0.0,
      "num_tokens": 4239646.0,
      "reward": 1.1062500476837158,
      "reward_std": 0.3689020574092865,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.15625,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.3667725324630737,
      "sampling/importance_sampling_ratio/mean": 0.8795427680015564,
      "sampling/importance_sampling_ratio/min": 0.3527143597602844,
      "sampling/sampling_logp_difference/max": 0.6183974742889404,
      "sampling/sampling_logp_difference/mean": 0.1178746223449707,
      "step": 179,
      "step_time": 10.46284436499991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1964821219444275,
      "epoch": 0.0018,
      "grad_norm": 6.373682845151052e-05,
      "kl": 0.04784144414588809,
      "learning_rate": 7.999992322712075e-06,
      "loss": 0.0,
      "step": 180,
      "step_time": 6.581123827999818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.4375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2230155989527702,
      "epoch": 0.00181,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.241205119295046e-05,
      "kl": 0.2899513263255358,
      "learning_rate": 7.999992215712882e-06,
      "loss": 0.0,
      "num_tokens": 4287392.0,
      "reward": 1.3875000476837158,
      "reward_std": 0.504016101360321,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.4375,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.687260389328003,
      "sampling/importance_sampling_ratio/mean": 0.8565247058868408,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.2024495601654053,
      "sampling/sampling_logp_difference/mean": 0.1502533257007599,
      "step": 181,
      "step_time": 9.957160163000026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.2253851145505905,
      "epoch": 0.00182,
      "grad_norm": 7.033378642518073e-05,
      "kl": 0.2906297470035497,
      "learning_rate": 7.999992107973214e-06,
      "loss": 0.0,
      "step": 182,
      "step_time": 6.031087762999505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9202814698219299,
      "epoch": 0.00183,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00013524244423024356,
      "kl": 0.29891258222050965,
      "learning_rate": 7.999991999493065e-06,
      "loss": 0.0,
      "num_tokens": 4334719.0,
      "reward": 1.4500000476837158,
      "reward_std": 0.5080004930496216,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.0915449857711792,
      "sampling/importance_sampling_ratio/mean": 0.8393715620040894,
      "sampling/importance_sampling_ratio/min": 0.21315446496009827,
      "sampling/sampling_logp_difference/max": 1.4369076490402222,
      "sampling/sampling_logp_difference/mean": 0.10889923572540283,
      "step": 183,
      "step_time": 10.379452743000002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9316936507821083,
      "epoch": 0.00184,
      "grad_norm": 0.0001468406117055565,
      "kl": 0.29929662309587,
      "learning_rate": 7.999991890272441e-06,
      "loss": 0.0,
      "step": 184,
      "step_time": 6.48263137099957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5239233821630478,
      "epoch": 0.00185,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0020477992948144674,
      "kl": 0.5273855591076426,
      "learning_rate": 7.999991780311339e-06,
      "loss": -0.0001,
      "num_tokens": 4379553.0,
      "reward": 3.231250047683716,
      "reward_std": 1.1139692068099976,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.3125,
      "rewards/probe_shaping_dominance/std": 0.4709290862083435,
      "rewards/probe_terminal_raw/mean": 0.6875,
      "rewards/probe_terminal_raw/std": 0.4709290862083435,
      "rewards/rollout_reward_func/mean": 0.375,
      "rewards/rollout_reward_func/std": 0.941858172416687,
      "sampling/importance_sampling_ratio/max": 1.0954153537750244,
      "sampling/importance_sampling_ratio/mean": 0.8618326783180237,
      "sampling/importance_sampling_ratio/min": 0.4107683002948761,
      "sampling/sampling_logp_difference/max": 0.7406651973724365,
      "sampling/sampling_logp_difference/mean": 0.06812501698732376,
      "step": 185,
      "step_time": 9.822880733999682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.5153519753366709,
      "epoch": 0.00186,
      "grad_norm": 0.001283274032175541,
      "kl": 0.5346019868738949,
      "learning_rate": 7.999991669609758e-06,
      "loss": -0.0001,
      "step": 186,
      "step_time": 5.9687305269994795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.71875,
      "completions/mean_terminated_length": 2.71875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6474461331963539,
      "epoch": 0.00187,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.33456661994569e-05,
      "kl": 0.41244959738105536,
      "learning_rate": 7.999991558167702e-06,
      "loss": 0.0,
      "num_tokens": 4426775.0,
      "reward": 2.231250047683716,
      "reward_std": 1.0846248865127563,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.71875,
      "rewards/probe_completion_length/std": 0.45680341124534607,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.5267152786254883,
      "sampling/importance_sampling_ratio/mean": 0.9380771517753601,
      "sampling/importance_sampling_ratio/min": 0.6180238127708435,
      "sampling/sampling_logp_difference/max": 0.5256781578063965,
      "sampling/sampling_logp_difference/mean": 0.06567574292421341,
      "step": 187,
      "step_time": 10.448041156000272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6464254632592201,
      "epoch": 0.00188,
      "grad_norm": 4.453281871974468e-05,
      "kl": 0.41222386225126684,
      "learning_rate": 7.999991445985168e-06,
      "loss": 0.0,
      "step": 188,
      "step_time": 6.039322745000391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.40625,
      "completions/mean_terminated_length": 2.40625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0263744220137596,
      "epoch": 0.00189,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.976044874638319e-05,
      "kl": 0.5344130313023925,
      "learning_rate": 7.999991333062156e-06,
      "loss": 0.0,
      "num_tokens": 4475338.0,
      "reward": 1.3875000476837158,
      "reward_std": 0.504016101360321,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.40625,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.1248929500579834,
      "sampling/importance_sampling_ratio/mean": 0.8441773653030396,
      "sampling/importance_sampling_ratio/min": 0.3922726809978485,
      "sampling/sampling_logp_difference/max": 0.6367834806442261,
      "sampling/sampling_logp_difference/mean": 0.10147504508495331,
      "step": 189,
      "step_time": 10.450124996000795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0167877674102783,
      "epoch": 0.0019,
      "grad_norm": 5.3309948270907626e-05,
      "kl": 0.5326844826340675,
      "learning_rate": 7.999991219398665e-06,
      "loss": 0.0,
      "step": 190,
      "step_time": 6.544803051000599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5253524780273438,
      "epoch": 0.00191,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0043292418122291565,
      "kl": 2.8481499888002872,
      "learning_rate": 7.999991104994699e-06,
      "loss": -0.0001,
      "num_tokens": 4520792.0,
      "reward": 3.231250047683716,
      "reward_std": 1.1976959705352783,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.28125,
      "rewards/probe_shaping_dominance/std": 0.45680341124534607,
      "rewards/probe_terminal_raw/mean": 0.71875,
      "rewards/probe_terminal_raw/std": 0.45680341124534607,
      "rewards/rollout_reward_func/mean": 0.4375,
      "rewards/rollout_reward_func/std": 0.9136068224906921,
      "sampling/importance_sampling_ratio/max": 1.7498235702514648,
      "sampling/importance_sampling_ratio/mean": 0.9441710114479065,
      "sampling/importance_sampling_ratio/min": 0.24626334011554718,
      "sampling/sampling_logp_difference/max": 1.2287273406982422,
      "sampling/sampling_logp_difference/mean": 0.05860981345176697,
      "step": 191,
      "step_time": 9.974904008000067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.5309231318533421,
      "epoch": 0.00192,
      "grad_norm": 0.002293355530127883,
      "kl": 2.6455338932573795,
      "learning_rate": 7.999990989850255e-06,
      "loss": -0.0001,
      "step": 192,
      "step_time": 6.537967493999986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.5067413002252579,
      "epoch": 0.00193,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.002911539049819112,
      "kl": 2.6948545714840293,
      "learning_rate": 7.999990873965335e-06,
      "loss": -0.0,
      "num_tokens": 4566659.0,
      "reward": 3.075000047683716,
      "reward_std": 1.008032202720642,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.0,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.4375,
      "rewards/probe_shaping_dominance/std": 0.504016101360321,
      "rewards/probe_terminal_raw/mean": 0.5625,
      "rewards/probe_terminal_raw/std": 0.504016101360321,
      "rewards/rollout_reward_func/mean": 0.125,
      "rewards/rollout_reward_func/std": 1.008032202720642,
      "sampling/importance_sampling_ratio/max": 1.6099581718444824,
      "sampling/importance_sampling_ratio/mean": 0.8238017559051514,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.4553546905517578,
      "sampling/sampling_logp_difference/mean": 0.12532293796539307,
      "step": 193,
      "step_time": 10.576807229999758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5161035768687725,
      "epoch": 0.00194,
      "grad_norm": 0.0030419155955314636,
      "kl": 2.718808156438172,
      "learning_rate": 7.999990757339936e-06,
      "loss": -0.0,
      "step": 194,
      "step_time": 6.058262799999284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5364239364862442,
      "epoch": 0.00195,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.003811279311776161,
      "kl": 0.7552666682749987,
      "learning_rate": 7.99999063997406e-06,
      "loss": -0.0,
      "num_tokens": 4616206.0,
      "reward": 2.918750047683716,
      "reward_std": 1.2309025526046753,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.4375,
      "rewards/probe_shaping_dominance/std": 0.504016101360321,
      "rewards/probe_terminal_raw/mean": 0.5625,
      "rewards/probe_terminal_raw/std": 0.504016101360321,
      "rewards/rollout_reward_func/mean": 0.125,
      "rewards/rollout_reward_func/std": 1.008032202720642,
      "sampling/importance_sampling_ratio/max": 1.210990071296692,
      "sampling/importance_sampling_ratio/mean": 0.9246749877929688,
      "sampling/importance_sampling_ratio/min": 0.5606986880302429,
      "sampling/sampling_logp_difference/max": 0.39641594886779785,
      "sampling/sampling_logp_difference/mean": 0.04741666093468666,
      "step": 195,
      "step_time": 10.604286133999722
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.5519651547074318,
      "epoch": 0.00196,
      "grad_norm": 0.0031288517639040947,
      "kl": 0.757812971714884,
      "learning_rate": 7.999990521867708e-06,
      "loss": -0.0,
      "step": 196,
      "step_time": 6.057004269001027
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9043734967708588,
      "epoch": 0.00197,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.1686216465895995e-05,
      "kl": 0.6568926060572267,
      "learning_rate": 7.999990403020877e-06,
      "loss": 0.0,
      "num_tokens": 4660047.0,
      "reward": 2.543750047683716,
      "reward_std": 1.4560080766677856,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.1121628284454346,
      "sampling/importance_sampling_ratio/mean": 0.8413404226303101,
      "sampling/importance_sampling_ratio/min": 0.46061256527900696,
      "sampling/sampling_logp_difference/max": 0.463133841753006,
      "sampling/sampling_logp_difference/mean": 0.08755563199520111,
      "step": 197,
      "step_time": 10.420948368001063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9122908115386963,
      "epoch": 0.00198,
      "grad_norm": 4.955950134899467e-05,
      "kl": 0.6581820687279105,
      "learning_rate": 7.99999028343357e-06,
      "loss": 0.0,
      "step": 198,
      "step_time": 5.955800602000636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0404513776302338,
      "epoch": 0.00199,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003683996037580073,
      "kl": 0.8756526485085487,
      "learning_rate": 7.999990163105786e-06,
      "loss": 0.0,
      "num_tokens": 4708237.0,
      "reward": 1.9812500476837158,
      "reward_std": 1.2309025526046753,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 2.5098929405212402,
      "sampling/importance_sampling_ratio/mean": 0.9004417657852173,
      "sampling/importance_sampling_ratio/min": 0.1496535837650299,
      "sampling/sampling_logp_difference/max": 1.4193356037139893,
      "sampling/sampling_logp_difference/mean": 0.14671702682971954,
      "step": 199,
      "step_time": 10.53082898600087
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.051968514919281,
      "epoch": 0.002,
      "grad_norm": 0.00044793030247092247,
      "kl": 0.8906243778765202,
      "learning_rate": 7.999990042037526e-06,
      "loss": 0.0,
      "step": 200,
      "step_time": 6.516393788000187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.1170097887516022,
      "epoch": 0.00201,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.0482478147605434e-05,
      "kl": 0.7796681722393259,
      "learning_rate": 7.999989920228787e-06,
      "loss": 0.0,
      "num_tokens": 4754978.0,
      "reward": 2.012500047683716,
      "reward_std": 1.2164862155914307,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.2457513809204102,
      "sampling/importance_sampling_ratio/mean": 0.8713302612304688,
      "sampling/importance_sampling_ratio/min": 0.3511073887348175,
      "sampling/sampling_logp_difference/max": 0.9598789215087891,
      "sampling/sampling_logp_difference/mean": 0.10739913582801819,
      "step": 201,
      "step_time": 9.898047209000197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1049348339438438,
      "epoch": 0.00202,
      "grad_norm": 9.396205859957263e-05,
      "kl": 0.7901200173655525,
      "learning_rate": 7.999989797679573e-06,
      "loss": 0.0,
      "step": 202,
      "step_time": 6.52449063100039
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.033333334140479565,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0378850102424622,
      "epoch": 0.00203,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0005378347705118358,
      "kl": 0.2997244680300355,
      "learning_rate": 7.99998967438988e-06,
      "loss": -0.0001,
      "num_tokens": 4805189.0,
      "reward": 1.5750000476837158,
      "reward_std": 0.7931155562400818,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9375,
      "rewards/probe_shaping_dominance/std": 0.24593468010425568,
      "rewards/probe_terminal_raw/mean": 0.0625,
      "rewards/probe_terminal_raw/std": 0.24593468010425568,
      "rewards/rollout_reward_func/mean": -0.875,
      "rewards/rollout_reward_func/std": 0.49186936020851135,
      "sampling/importance_sampling_ratio/max": 1.14525306224823,
      "sampling/importance_sampling_ratio/mean": 0.758590817451477,
      "sampling/importance_sampling_ratio/min": 0.21433056890964508,
      "sampling/sampling_logp_difference/max": 1.4624667167663574,
      "sampling/sampling_logp_difference/mean": 0.15886078774929047,
      "step": 203,
      "step_time": 9.993277525999929
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 1.058366522192955,
      "epoch": 0.00204,
      "grad_norm": 0.0009679613285697997,
      "kl": 0.3104899302124977,
      "learning_rate": 7.999989550359713e-06,
      "loss": -0.0001,
      "step": 204,
      "step_time": 6.569216364000113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0512825474143028,
      "epoch": 0.00205,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 8.101179992081597e-05,
      "kl": 0.15521670621819794,
      "learning_rate": 7.999989425589066e-06,
      "loss": 0.0,
      "num_tokens": 4852978.0,
      "reward": 2.012500047683716,
      "reward_std": 1.2164862155914307,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.1806155443191528,
      "sampling/importance_sampling_ratio/mean": 0.8150932192802429,
      "sampling/importance_sampling_ratio/min": 0.44537922739982605,
      "sampling/sampling_logp_difference/max": 0.5530131459236145,
      "sampling/sampling_logp_difference/mean": 0.11240445077419281,
      "step": 205,
      "step_time": 10.127766082000107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0513731241226196,
      "epoch": 0.00206,
      "grad_norm": 0.00010898512846324593,
      "kl": 0.16864246875047684,
      "learning_rate": 7.999989300077943e-06,
      "loss": 0.0,
      "step": 206,
      "step_time": 6.666827258999547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9041898250579834,
      "epoch": 0.00207,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006515422137454152,
      "kl": 1.24289670586586,
      "learning_rate": 7.999989173826344e-06,
      "loss": 0.0,
      "num_tokens": 4900008.0,
      "reward": 1.6687500476837158,
      "reward_std": 0.45680341124534607,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 2.87306547164917,
      "sampling/importance_sampling_ratio/mean": 0.8855913281440735,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 2.2157533168792725,
      "sampling/sampling_logp_difference/mean": 0.18908201158046722,
      "step": 207,
      "step_time": 9.946056094000596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8971953243017197,
      "epoch": 0.00208,
      "grad_norm": 0.0004797672445420176,
      "kl": 1.1996175777167082,
      "learning_rate": 7.999989046834267e-06,
      "loss": 0.0,
      "step": 208,
      "step_time": 6.549594394000906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9334496781229973,
      "epoch": 0.00209,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 2.8083361030439846e-05,
      "kl": 0.10746793961152434,
      "learning_rate": 7.999988919101714e-06,
      "loss": 0.0,
      "num_tokens": 4947000.0,
      "reward": 2.1062498092651367,
      "reward_std": 1.7799850702285767,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 1.4280457496643066,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.4213379621505737,
      "sampling/importance_sampling_ratio/mean": 0.8671090006828308,
      "sampling/importance_sampling_ratio/min": 0.6377511024475098,
      "sampling/sampling_logp_difference/max": 0.518344521522522,
      "sampling/sampling_logp_difference/mean": 0.08076303452253342,
      "step": 209,
      "step_time": 9.98248077000062
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9456727504730225,
      "epoch": 0.0021,
      "grad_norm": 3.3368298318237066e-05,
      "kl": 0.1082556719193235,
      "learning_rate": 7.999988790628685e-06,
      "loss": 0.0,
      "step": 210,
      "step_time": 6.5108358289990065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.3125,
      "completions/mean_terminated_length": 2.3125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2894764840602875,
      "epoch": 0.00211,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.01579872891306877,
      "kl": 1.3035051634069532,
      "learning_rate": 7.999988661415179e-06,
      "loss": 0.0,
      "num_tokens": 4996332.0,
      "reward": 1.2625000476837158,
      "reward_std": 0.4709290862083435,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.3125,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.0238986015319824,
      "sampling/importance_sampling_ratio/mean": 0.7401514053344727,
      "sampling/importance_sampling_ratio/min": 0.20138022303581238,
      "sampling/sampling_logp_difference/max": 1.3580923080444336,
      "sampling/sampling_logp_difference/mean": 0.17557558417320251,
      "step": 211,
      "step_time": 10.527099645000362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.2880031913518906,
      "epoch": 0.00212,
      "grad_norm": 0.0018287961138412356,
      "kl": 1.0675127455033362,
      "learning_rate": 7.999988531461196e-06,
      "loss": 0.0,
      "step": 212,
      "step_time": 6.03448314499974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 110.0,
      "completions/max_terminated_length": 110.0,
      "completions/mean_length": 6.0625,
      "completions/mean_terminated_length": 6.0625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9502601698040962,
      "epoch": 0.00213,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.005205703433603048,
      "kl": 1.4420340545475483,
      "learning_rate": 7.999988400766736e-06,
      "loss": 0.0,
      "num_tokens": 5047044.0,
      "reward": 12.856250762939453,
      "reward_std": 58.7901496887207,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 12.09375,
      "rewards/probe_completion_length/std": 58.92715835571289,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 0.625,
      "rewards/probe_shaping_dominance/std": 0.49186936020851135,
      "rewards/probe_terminal_raw/mean": 0.375,
      "rewards/probe_terminal_raw/std": 0.49186936020851135,
      "rewards/rollout_reward_func/mean": -0.25,
      "rewards/rollout_reward_func/std": 0.9837387204170227,
      "sampling/importance_sampling_ratio/max": 2.39088773727417,
      "sampling/importance_sampling_ratio/mean": 0.8575817942619324,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.5442379713058472,
      "sampling/sampling_logp_difference/mean": 0.19435152411460876,
      "step": 213,
      "step_time": 11.199458510000568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.9396770298480988,
      "epoch": 0.00214,
      "grad_norm": 0.0025155353359878063,
      "kl": 1.3642384968698025,
      "learning_rate": 7.999988269331798e-06,
      "loss": -0.0,
      "step": 214,
      "step_time": 6.795581481000227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7093618139624596,
      "epoch": 0.00215,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.147308936808258e-05,
      "kl": 0.8155974000692368,
      "learning_rate": 7.999988137156384e-06,
      "loss": 0.0,
      "num_tokens": 5092001.0,
      "reward": 2.575000047683716,
      "reward_std": 1.4312187433242798,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.1610689163208008,
      "sampling/importance_sampling_ratio/mean": 0.9294631481170654,
      "sampling/importance_sampling_ratio/min": 0.6643704175949097,
      "sampling/sampling_logp_difference/max": 0.2963399291038513,
      "sampling/sampling_logp_difference/mean": 0.05530760437250137,
      "step": 215,
      "step_time": 10.349047988000166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7076744586229324,
      "epoch": 0.00216,
      "grad_norm": 2.461934491293505e-05,
      "kl": 0.8128750696778297,
      "learning_rate": 7.999988004240496e-06,
      "loss": 0.0,
      "step": 216,
      "step_time": 5.973635649000244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9698906242847443,
      "epoch": 0.00217,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0015630932757630944,
      "kl": 0.5165037027036306,
      "learning_rate": 7.99998787058413e-06,
      "loss": -0.0001,
      "num_tokens": 5138251.0,
      "reward": 2.481250047683716,
      "reward_std": 1.6260851621627808,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 1.221035361289978,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.65625,
      "rewards/probe_shaping_dominance/std": 0.4825586974620819,
      "rewards/probe_terminal_raw/mean": 0.34375,
      "rewards/probe_terminal_raw/std": 0.4825586974620819,
      "rewards/rollout_reward_func/mean": -0.3125,
      "rewards/rollout_reward_func/std": 0.9651173949241638,
      "sampling/importance_sampling_ratio/max": 1.0233986377716064,
      "sampling/importance_sampling_ratio/mean": 0.8025634288787842,
      "sampling/importance_sampling_ratio/min": 0.2496911883354187,
      "sampling/sampling_logp_difference/max": 1.2798473834991455,
      "sampling/sampling_logp_difference/mean": 0.1027006059885025,
      "step": 217,
      "step_time": 9.982337998999355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9763199612498283,
      "epoch": 0.00218,
      "grad_norm": 0.001598120667040348,
      "kl": 0.5372840870841173,
      "learning_rate": 7.999987736187286e-06,
      "loss": -0.0001,
      "step": 218,
      "step_time": 6.55624300799991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5885202661156654,
      "epoch": 0.00219,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0022829018998891115,
      "kl": 1.3220407664775848,
      "learning_rate": 7.999987601049967e-06,
      "loss": -0.0001,
      "num_tokens": 5188054.0,
      "reward": 3.168750047683716,
      "reward_std": 1.2110878229141235,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.3125,
      "rewards/probe_shaping_dominance/std": 0.4709290862083435,
      "rewards/probe_terminal_raw/mean": 0.6875,
      "rewards/probe_terminal_raw/std": 0.4709290862083435,
      "rewards/rollout_reward_func/mean": 0.375,
      "rewards/rollout_reward_func/std": 0.941858172416687,
      "sampling/importance_sampling_ratio/max": 1.3682610988616943,
      "sampling/importance_sampling_ratio/mean": 0.8985254764556885,
      "sampling/importance_sampling_ratio/min": 0.30704811215400696,
      "sampling/sampling_logp_difference/max": 1.0124280452728271,
      "sampling/sampling_logp_difference/mean": 0.07173188030719757,
      "step": 219,
      "step_time": 10.571298075000414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.5978320278227329,
      "epoch": 0.0022,
      "grad_norm": 0.0007615772192366421,
      "kl": 1.3663252778351307,
      "learning_rate": 7.99998746517217e-06,
      "loss": -0.0001,
      "step": 220,
      "step_time": 6.068325715998981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.091103121638298,
      "epoch": 0.00221,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003277143114246428,
      "kl": 0.8019103519618511,
      "learning_rate": 7.999987328553901e-06,
      "loss": 0.0,
      "num_tokens": 5236858.0,
      "reward": 1.6062500476837158,
      "reward_std": 0.4825586974620819,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.9474204778671265,
      "sampling/importance_sampling_ratio/mean": 0.8071853518486023,
      "sampling/importance_sampling_ratio/min": 0.23529821634292603,
      "sampling/sampling_logp_difference/max": 1.3313655853271484,
      "sampling/sampling_logp_difference/mean": 0.15911154448986053,
      "step": 221,
      "step_time": 10.23791552100056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.089771643280983,
      "epoch": 0.00222,
      "grad_norm": 0.00040844481554813683,
      "kl": 0.8609168156981468,
      "learning_rate": 7.999987191195152e-06,
      "loss": 0.0,
      "step": 222,
      "step_time": 7.202484536999691
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9173014238476753,
      "epoch": 0.00223,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 7.766550697851926e-05,
      "kl": 1.2079013511538506,
      "learning_rate": 7.999987053095927e-06,
      "loss": 0.0,
      "num_tokens": 5288141.0,
      "reward": 2.012500047683716,
      "reward_std": 1.2164862155914307,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.8275642395019531,
      "sampling/importance_sampling_ratio/mean": 0.8998434543609619,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.3255605697631836,
      "sampling/sampling_logp_difference/mean": 0.11937170475721359,
      "step": 223,
      "step_time": 10.373311099000148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9247178286314011,
      "epoch": 0.00224,
      "grad_norm": 7.279904093593359e-05,
      "kl": 1.2060724347829819,
      "learning_rate": 7.999986914256228e-06,
      "loss": 0.0,
      "step": 224,
      "step_time": 6.244458222000503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7969616949558258,
      "epoch": 0.00225,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0032166424207389355,
      "kl": 1.2331771329045296,
      "learning_rate": 7.99998677467605e-06,
      "loss": -0.0001,
      "num_tokens": 5329787.0,
      "reward": 2.543750047683716,
      "reward_std": 1.2406911849975586,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.59375,
      "rewards/probe_shaping_dominance/std": 0.49899089336395264,
      "rewards/probe_terminal_raw/mean": 0.40625,
      "rewards/probe_terminal_raw/std": 0.49899089336395264,
      "rewards/rollout_reward_func/mean": -0.1875,
      "rewards/rollout_reward_func/std": 0.9979817867279053,
      "sampling/importance_sampling_ratio/max": 0.9643709063529968,
      "sampling/importance_sampling_ratio/mean": 0.7731209993362427,
      "sampling/importance_sampling_ratio/min": 0.25780153274536133,
      "sampling/sampling_logp_difference/max": 1.1979801654815674,
      "sampling/sampling_logp_difference/mean": 0.11933620274066925,
      "step": 225,
      "step_time": 10.426335748000838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7953629866242409,
      "epoch": 0.00226,
      "grad_norm": 0.0023331514094024897,
      "kl": 1.408123280853033,
      "learning_rate": 7.999986634355396e-06,
      "loss": -0.0001,
      "step": 226,
      "step_time": 6.432960845000707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5347747877240181,
      "epoch": 0.00227,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 8.225801138905808e-05,
      "kl": 0.4508226178586483,
      "learning_rate": 7.999986493294267e-06,
      "loss": 0.0,
      "num_tokens": 5367612.0,
      "reward": 3.262500047683716,
      "reward_std": 1.2296733856201172,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.25,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.75,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": 0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.0039564371109009,
      "sampling/importance_sampling_ratio/mean": 0.9031103849411011,
      "sampling/importance_sampling_ratio/min": 0.511532723903656,
      "sampling/sampling_logp_difference/max": 0.46492481231689453,
      "sampling/sampling_logp_difference/mean": 0.04437258839607239,
      "step": 227,
      "step_time": 9.742115944000489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5368616580963135,
      "epoch": 0.00228,
      "grad_norm": 5.892190893064253e-05,
      "kl": 0.4490237596037332,
      "learning_rate": 7.99998635149266e-06,
      "loss": 0.0,
      "step": 228,
      "step_time": 5.865072176000012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0168335363268852,
      "epoch": 0.00229,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 3.180320345563814e-05,
      "kl": 0.3754228218458593,
      "learning_rate": 7.99998620895058e-06,
      "loss": 0.0,
      "num_tokens": 5417542.0,
      "reward": 2.012500047683716,
      "reward_std": 1.2164862155914307,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.1908074617385864,
      "sampling/importance_sampling_ratio/mean": 0.8601307272911072,
      "sampling/importance_sampling_ratio/min": 0.5950575470924377,
      "sampling/sampling_logp_difference/max": 0.333740234375,
      "sampling/sampling_logp_difference/mean": 0.08275139331817627,
      "step": 229,
      "step_time": 10.50062071100001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.01875402033329,
      "epoch": 0.0023,
      "grad_norm": 3.042643038497772e-05,
      "kl": 0.3755720476619899,
      "learning_rate": 7.999986065668021e-06,
      "loss": 0.0,
      "step": 230,
      "step_time": 6.5292883610009085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.1066185683012009,
      "epoch": 0.00231,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006037249695509672,
      "kl": 0.5976095534861088,
      "learning_rate": 7.999985921644988e-06,
      "loss": 0.0,
      "num_tokens": 5464839.0,
      "reward": 1.9500000476837158,
      "reward_std": 1.2443419694900513,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.0169601440429688,
      "sampling/importance_sampling_ratio/mean": 0.7551096677780151,
      "sampling/importance_sampling_ratio/min": 0.1823217123746872,
      "sampling/sampling_logp_difference/max": 1.54960298538208,
      "sampling/sampling_logp_difference/mean": 0.14190775156021118,
      "step": 231,
      "step_time": 9.871105007000097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1074366867542267,
      "epoch": 0.00232,
      "grad_norm": 0.00048756369506008923,
      "kl": 0.5469358395785093,
      "learning_rate": 7.999985776881479e-06,
      "loss": 0.0,
      "step": 232,
      "step_time": 6.5496248439994815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8052812218666077,
      "epoch": 0.00233,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000179827562533319,
      "kl": 0.8202984808012843,
      "learning_rate": 7.999985631377493e-06,
      "loss": 0.0,
      "num_tokens": 5508160.0,
      "reward": 2.075000047683716,
      "reward_std": 1.1845782995224,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 2.091932773590088,
      "sampling/importance_sampling_ratio/mean": 0.9047304391860962,
      "sampling/importance_sampling_ratio/min": 0.3661075830459595,
      "sampling/sampling_logp_difference/max": 0.8838050961494446,
      "sampling/sampling_logp_difference/mean": 0.12408536672592163,
      "step": 233,
      "step_time": 10.55139884099981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8096337169408798,
      "epoch": 0.00234,
      "grad_norm": 0.00014511325571220368,
      "kl": 0.8095588530413806,
      "learning_rate": 7.99998548513303e-06,
      "loss": 0.0,
      "step": 234,
      "step_time": 6.070462433999637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9569649696350098,
      "epoch": 0.00235,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 9.280137601308525e-05,
      "kl": 0.7535966765135527,
      "learning_rate": 7.999985338148094e-06,
      "loss": 0.0,
      "num_tokens": 5553778.0,
      "reward": 1.9812500476837158,
      "reward_std": 1.2309025526046753,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.3018815517425537,
      "sampling/importance_sampling_ratio/mean": 0.8392325639724731,
      "sampling/importance_sampling_ratio/min": 0.22106604278087616,
      "sampling/sampling_logp_difference/max": 0.8340189456939697,
      "sampling/sampling_logp_difference/mean": 0.11187630146741867,
      "step": 235,
      "step_time": 9.83470956599922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9559657499194145,
      "epoch": 0.00236,
      "grad_norm": 6.163664511404932e-05,
      "kl": 0.7435445003211498,
      "learning_rate": 7.99998519042268e-06,
      "loss": 0.0,
      "step": 236,
      "step_time": 6.452000556000257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.28125,
      "completions/mean_terminated_length": 2.28125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.294971227645874,
      "epoch": 0.00237,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.2004070564871654e-05,
      "kl": 0.04399583343183622,
      "learning_rate": 7.99998504195679e-06,
      "loss": 0.0,
      "num_tokens": 5602651.0,
      "reward": 1.2312500476837158,
      "reward_std": 0.45680341124534607,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.28125,
      "rewards/probe_completion_length/std": 0.45680341124534607,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.0553680658340454,
      "sampling/importance_sampling_ratio/mean": 0.7829026579856873,
      "sampling/importance_sampling_ratio/min": 0.41982340812683105,
      "sampling/sampling_logp_difference/max": 0.5661625862121582,
      "sampling/sampling_logp_difference/mean": 0.12211545556783676,
      "step": 237,
      "step_time": 10.069317695000791
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.300062656402588,
      "epoch": 0.00238,
      "grad_norm": 3.1589595892000943e-05,
      "kl": 0.039544737548567355,
      "learning_rate": 7.999984892750425e-06,
      "loss": 0.0,
      "step": 238,
      "step_time": 6.534631725000509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9069351330399513,
      "epoch": 0.00239,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00010841053153853863,
      "kl": 0.9024822637438774,
      "learning_rate": 7.999984742803586e-06,
      "loss": 0.0,
      "num_tokens": 5651408.0,
      "reward": 2.106250047683716,
      "reward_std": 1.1670026779174805,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.0072801113128662,
      "sampling/importance_sampling_ratio/mean": 0.7536620497703552,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.7853782176971436,
      "sampling/sampling_logp_difference/mean": 0.13082578778266907,
      "step": 239,
      "step_time": 10.10397600000033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9182785451412201,
      "epoch": 0.0024,
      "grad_norm": 0.00010150086745852605,
      "kl": 0.898620568215847,
      "learning_rate": 7.999984592116268e-06,
      "loss": 0.0,
      "step": 240,
      "step_time": 6.599010469999939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.791859045624733,
      "epoch": 0.00241,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001330450875684619,
      "kl": 0.6041602776385844,
      "learning_rate": 7.999984440688477e-06,
      "loss": 0.0,
      "num_tokens": 5695231.0,
      "reward": 2.762500047683716,
      "reward_std": 1.2556325197219849,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.0298597812652588,
      "sampling/importance_sampling_ratio/mean": 0.7487383484840393,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.494455337524414,
      "sampling/sampling_logp_difference/mean": 0.15715257823467255,
      "step": 241,
      "step_time": 9.925578503999986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7927238866686821,
      "epoch": 0.00242,
      "grad_norm": 0.0007401671609841287,
      "kl": 0.46895526768639684,
      "learning_rate": 7.999984288520209e-06,
      "loss": 0.0,
      "step": 242,
      "step_time": 5.904219341000498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8608187511563301,
      "epoch": 0.00243,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 3.881949669448659e-05,
      "kl": 0.20748848652510787,
      "learning_rate": 7.999984135611465e-06,
      "loss": 0.0,
      "num_tokens": 5739390.0,
      "reward": 2.543750047683716,
      "reward_std": 1.4560080766677856,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.1807578802108765,
      "sampling/importance_sampling_ratio/mean": 0.9090617895126343,
      "sampling/importance_sampling_ratio/min": 0.38658732175827026,
      "sampling/sampling_logp_difference/max": 0.6620534062385559,
      "sampling/sampling_logp_difference/mean": 0.07685399055480957,
      "step": 243,
      "step_time": 10.37353355400046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8746257722377777,
      "epoch": 0.00244,
      "grad_norm": 4.032689321320504e-05,
      "kl": 0.20645966165466234,
      "learning_rate": 7.999983981962246e-06,
      "loss": 0.0,
      "step": 244,
      "step_time": 6.476325880000331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.375,
      "completions/mean_terminated_length": 2.375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.4716509729623795,
      "epoch": 0.00245,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 9.890943329082802e-05,
      "kl": 0.11096427217125893,
      "learning_rate": 7.999983827572551e-06,
      "loss": 0.0,
      "num_tokens": 5786717.0,
      "reward": 1.3875000476837158,
      "reward_std": 1.435438632965088,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.4375,
      "rewards/probe_completion_length/std": 1.4354385137557983,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.2329347133636475,
      "sampling/importance_sampling_ratio/mean": 0.7989503145217896,
      "sampling/importance_sampling_ratio/min": 0.3760721981525421,
      "sampling/sampling_logp_difference/max": 0.6286892890930176,
      "sampling/sampling_logp_difference/mean": 0.14064887166023254,
      "step": 245,
      "step_time": 9.838691988000392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.481234386563301,
      "epoch": 0.00246,
      "grad_norm": 0.00011490120232338086,
      "kl": 0.1183417895808816,
      "learning_rate": 7.999983672442382e-06,
      "loss": 0.0,
      "step": 246,
      "step_time": 5.9876724660002765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.71875,
      "completions/mean_terminated_length": 2.71875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9683929979801178,
      "epoch": 0.00247,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0005364255630411208,
      "kl": 1.5163855329155922,
      "learning_rate": 7.999983516571737e-06,
      "loss": -0.0,
      "num_tokens": 5834566.0,
      "reward": 2.606250047683716,
      "reward_std": 1.334634780883789,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.71875,
      "rewards/probe_completion_length/std": 0.45680341124534607,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.53125,
      "rewards/probe_shaping_dominance/std": 0.507007360458374,
      "rewards/probe_terminal_raw/mean": 0.46875,
      "rewards/probe_terminal_raw/std": 0.507007360458374,
      "rewards/rollout_reward_func/mean": -0.0625,
      "rewards/rollout_reward_func/std": 1.014014720916748,
      "sampling/importance_sampling_ratio/max": 1.0176223516464233,
      "sampling/importance_sampling_ratio/mean": 0.8093540668487549,
      "sampling/importance_sampling_ratio/min": 0.3403654098510742,
      "sampling/sampling_logp_difference/max": 0.8563747406005859,
      "sampling/sampling_logp_difference/mean": 0.11184030771255493,
      "step": 247,
      "step_time": 10.412933766999686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9714526236057281,
      "epoch": 0.00248,
      "grad_norm": 0.001109355129301548,
      "kl": 1.3958311006426811,
      "learning_rate": 7.999983359960615e-06,
      "loss": -0.0,
      "step": 248,
      "step_time": 6.410530790000394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.4375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2722000256180763,
      "epoch": 0.00249,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0002374774921918288,
      "kl": 0.2879834126215428,
      "learning_rate": 7.999983202609019e-06,
      "loss": 0.0,
      "num_tokens": 5885261.0,
      "reward": 1.3875000476837158,
      "reward_std": 0.504016101360321,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.4375,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.276147484779358,
      "sampling/importance_sampling_ratio/mean": 0.7094168066978455,
      "sampling/importance_sampling_ratio/min": 0.21389688551425934,
      "sampling/sampling_logp_difference/max": 1.3515539169311523,
      "sampling/sampling_logp_difference/mean": 0.18311890959739685,
      "step": 249,
      "step_time": 10.01058061699996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.271938718855381,
      "epoch": 0.0025,
      "grad_norm": 0.0001647109747864306,
      "kl": 0.30384280625730753,
      "learning_rate": 7.999983044516948e-06,
      "loss": 0.0,
      "step": 250,
      "step_time": 6.048756611999579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.25,
      "completions/mean_terminated_length": 2.25,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.4384316056966782,
      "epoch": 0.00251,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.8330115280114114e-05,
      "kl": 0.07054687337949872,
      "learning_rate": 7.999982885684401e-06,
      "loss": 0.0,
      "num_tokens": 5931755.0,
      "reward": 1.2000000476837158,
      "reward_std": 0.4399413466453552,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.25,
      "rewards/probe_completion_length/std": 0.4399413466453552,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.0516539812088013,
      "sampling/importance_sampling_ratio/mean": 0.7808082103729248,
      "sampling/importance_sampling_ratio/min": 0.33390164375305176,
      "sampling/sampling_logp_difference/max": 0.6143656969070435,
      "sampling/sampling_logp_difference/mean": 0.13979840278625488,
      "step": 251,
      "step_time": 10.83073114400031
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.4351339936256409,
      "epoch": 0.00252,
      "grad_norm": 4.7220288251992315e-05,
      "kl": 0.06005410762736574,
      "learning_rate": 7.99998272611138e-06,
      "loss": 0.0,
      "step": 252,
      "step_time": 5.9603065350002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2312592715024948,
      "epoch": 0.00253,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005001653335057199,
      "kl": 0.42731712557724677,
      "learning_rate": 7.999982565797882e-06,
      "loss": 0.0,
      "num_tokens": 5983713.0,
      "reward": 1.6999999284744263,
      "reward_std": 1.2443419694900513,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.6875,
      "rewards/probe_completion_length/std": 1.2556324005126953,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.4038161039352417,
      "sampling/importance_sampling_ratio/mean": 0.8100531697273254,
      "sampling/importance_sampling_ratio/min": 0.29241448640823364,
      "sampling/sampling_logp_difference/max": 1.844306468963623,
      "sampling/sampling_logp_difference/mean": 0.14470317959785461,
      "step": 253,
      "step_time": 10.147666580999612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.2324032559990883,
      "epoch": 0.00254,
      "grad_norm": 0.0008306861855089664,
      "kl": 0.47986630973173305,
      "learning_rate": 7.999982404743908e-06,
      "loss": 0.0,
      "step": 254,
      "step_time": 6.12800545500113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7071742042899132,
      "epoch": 0.00255,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00011028751032426953,
      "kl": 1.5195309668779373,
      "learning_rate": 7.999982242949461e-06,
      "loss": 0.0,
      "num_tokens": 6031353.0,
      "reward": 2.325000047683716,
      "reward_std": 1.1845781803131104,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.7071067690849304,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.6566476821899414,
      "sampling/importance_sampling_ratio/mean": 0.869540810585022,
      "sampling/importance_sampling_ratio/min": 0.24219129979610443,
      "sampling/sampling_logp_difference/max": 0.9972515106201172,
      "sampling/sampling_logp_difference/mean": 0.09803014993667603,
      "step": 255,
      "step_time": 10.103539768000246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7113880515098572,
      "epoch": 0.00256,
      "grad_norm": 0.00011303301289444789,
      "kl": 1.5209232168272138,
      "learning_rate": 7.999982080414539e-06,
      "loss": 0.0,
      "step": 256,
      "step_time": 7.0594580389997645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.795683853328228,
      "epoch": 0.00257,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0001374085695715621,
      "kl": 0.541247180350183,
      "learning_rate": 7.999981917139141e-06,
      "loss": 0.0,
      "num_tokens": 6079938.0,
      "reward": 2.731250047683716,
      "reward_std": 1.2885193824768066,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.2407677173614502,
      "sampling/importance_sampling_ratio/mean": 0.8522495031356812,
      "sampling/importance_sampling_ratio/min": 0.46599096059799194,
      "sampling/sampling_logp_difference/max": 0.549491286277771,
      "sampling/sampling_logp_difference/mean": 0.07480097562074661,
      "step": 257,
      "step_time": 10.19285303900051
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7980589717626572,
      "epoch": 0.00258,
      "grad_norm": 0.0001367518270853907,
      "kl": 0.5404217857867479,
      "learning_rate": 7.999981753123268e-06,
      "loss": 0.0,
      "step": 258,
      "step_time": 6.162642582999524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5748703218996525,
      "epoch": 0.00259,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 9.582992061041296e-05,
      "kl": 1.202084738528356,
      "learning_rate": 7.999981588366921e-06,
      "loss": 0.0,
      "num_tokens": 6122880.0,
      "reward": 2.762500047683716,
      "reward_std": 1.2556325197219849,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.2011796236038208,
      "sampling/importance_sampling_ratio/mean": 0.7956522703170776,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.9468035697937012,
      "sampling/sampling_logp_difference/mean": 0.13236525654792786,
      "step": 259,
      "step_time": 9.850775819999853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5715639665722847,
      "epoch": 0.0026,
      "grad_norm": 9.380384290125221e-05,
      "kl": 1.196174212731421,
      "learning_rate": 7.999981422870099e-06,
      "loss": 0.0,
      "step": 260,
      "step_time": 6.849544752000384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5606408379971981,
      "epoch": 0.00261,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0020095854997634888,
      "kl": 0.7049203501082957,
      "learning_rate": 7.999981256632802e-06,
      "loss": -0.0001,
      "num_tokens": 6166733.0,
      "reward": 3.075000047683716,
      "reward_std": 1.263635277748108,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.34375,
      "rewards/probe_shaping_dominance/std": 0.4825586974620819,
      "rewards/probe_terminal_raw/mean": 0.65625,
      "rewards/probe_terminal_raw/std": 0.4825586974620819,
      "rewards/rollout_reward_func/mean": 0.3125,
      "rewards/rollout_reward_func/std": 0.9651173949241638,
      "sampling/importance_sampling_ratio/max": 1.6943975687026978,
      "sampling/importance_sampling_ratio/mean": 0.94061279296875,
      "sampling/importance_sampling_ratio/min": 0.6277596354484558,
      "sampling/sampling_logp_difference/max": 0.5732210874557495,
      "sampling/sampling_logp_difference/mean": 0.058817073702812195,
      "step": 261,
      "step_time": 9.924657725000088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5698106549680233,
      "epoch": 0.00262,
      "grad_norm": 0.0022092636208981276,
      "kl": 0.7005162222776562,
      "learning_rate": 7.999981089655028e-06,
      "loss": -0.0001,
      "step": 262,
      "step_time": 5.950440025000717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0628485679626465,
      "epoch": 0.00263,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0022156224586069584,
      "kl": 1.1021189391613007,
      "learning_rate": 7.999980921936782e-06,
      "loss": -0.0001,
      "num_tokens": 6216735.0,
      "reward": 1.7625000476837158,
      "reward_std": 0.9651173949241638,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.875,
      "rewards/probe_shaping_dominance/std": 0.33601075410842896,
      "rewards/probe_terminal_raw/mean": 0.125,
      "rewards/probe_terminal_raw/std": 0.33601075410842896,
      "rewards/rollout_reward_func/mean": -0.75,
      "rewards/rollout_reward_func/std": 0.6720215082168579,
      "sampling/importance_sampling_ratio/max": 1.2917293310165405,
      "sampling/importance_sampling_ratio/mean": 0.7865704298019409,
      "sampling/importance_sampling_ratio/min": 0.22512774169445038,
      "sampling/sampling_logp_difference/max": 1.3956575393676758,
      "sampling/sampling_logp_difference/mean": 0.15481485426425934,
      "step": 263,
      "step_time": 10.660381645999678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0550967678427696,
      "epoch": 0.00264,
      "grad_norm": 0.002029229188337922,
      "kl": 1.2448055073618889,
      "learning_rate": 7.999980753478058e-06,
      "loss": -0.0001,
      "step": 264,
      "step_time": 6.621740929000225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 2.34375,
      "completions/mean_terminated_length": 2.34375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.3302894234657288,
      "epoch": 0.00265,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.390978574519977e-05,
      "kl": 0.07602045580279082,
      "learning_rate": 7.999980584278861e-06,
      "loss": 0.0,
      "num_tokens": 6266793.0,
      "reward": 1.3562500476837158,
      "reward_std": 1.2664244174957275,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.40625,
      "rewards/probe_completion_length/std": 1.266424298286438,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.5775710344314575,
      "sampling/importance_sampling_ratio/mean": 0.8080694079399109,
      "sampling/importance_sampling_ratio/min": 0.4113946259021759,
      "sampling/sampling_logp_difference/max": 0.6071059703826904,
      "sampling/sampling_logp_difference/mean": 0.13547483086585999,
      "step": 265,
      "step_time": 10.054258211000615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.3213010132312775,
      "epoch": 0.00266,
      "grad_norm": 4.630487455870025e-05,
      "kl": 0.0754888579249382,
      "learning_rate": 7.999980414339192e-06,
      "loss": 0.0,
      "step": 266,
      "step_time": 6.096907744999498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5773092843592167,
      "epoch": 0.00267,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0018685347167775035,
      "kl": 1.856453113257885,
      "learning_rate": 7.999980243659046e-06,
      "loss": -0.0,
      "num_tokens": 6315035.0,
      "reward": 3.168750047683716,
      "reward_std": 1.2885193824768066,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.28125,
      "rewards/probe_shaping_dominance/std": 0.45680341124534607,
      "rewards/probe_terminal_raw/mean": 0.71875,
      "rewards/probe_terminal_raw/std": 0.45680341124534607,
      "rewards/rollout_reward_func/mean": 0.4375,
      "rewards/rollout_reward_func/std": 0.9136068224906921,
      "sampling/importance_sampling_ratio/max": 1.2453944683074951,
      "sampling/importance_sampling_ratio/mean": 0.9141055345535278,
      "sampling/importance_sampling_ratio/min": 0.3399631381034851,
      "sampling/sampling_logp_difference/max": 0.9761523008346558,
      "sampling/sampling_logp_difference/mean": 0.06499834358692169,
      "step": 267,
      "step_time": 9.843675738999991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5740438252687454,
      "epoch": 0.00268,
      "grad_norm": 0.0016455126460641623,
      "kl": 1.8581569492816925,
      "learning_rate": 7.999980072238424e-06,
      "loss": -0.0001,
      "step": 268,
      "step_time": 6.936017353999887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.6875,
      "completions/mean_terminated_length": 2.6875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7434339039027691,
      "epoch": 0.00269,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.207367601338774e-05,
      "kl": 0.4917968454919901,
      "learning_rate": 7.999979900077329e-06,
      "loss": 0.0,
      "num_tokens": 6363326.0,
      "reward": 2.637500047683716,
      "reward_std": 1.3781123161315918,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.6875,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.243372917175293,
      "sampling/importance_sampling_ratio/mean": 0.9188703298568726,
      "sampling/importance_sampling_ratio/min": 0.5505338311195374,
      "sampling/sampling_logp_difference/max": 0.6038958430290222,
      "sampling/sampling_logp_difference/mean": 0.07901027053594589,
      "step": 269,
      "step_time": 9.94298315099968
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7558643519878387,
      "epoch": 0.0027,
      "grad_norm": 4.379342863103375e-05,
      "kl": 0.48969965358264744,
      "learning_rate": 7.99997972717576e-06,
      "loss": 0.0,
      "step": 270,
      "step_time": 6.032589478999853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6409019902348518,
      "epoch": 0.00271,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.020250845700502396,
      "kl": 5.129798948764801,
      "learning_rate": 7.999979553533716e-06,
      "loss": 0.0001,
      "num_tokens": 6407150.0,
      "reward": 2.762500047683716,
      "reward_std": 1.2556325197219849,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.0568947792053223,
      "sampling/importance_sampling_ratio/mean": 0.7568554282188416,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 3.129774332046509,
      "sampling/sampling_logp_difference/mean": 0.1845780611038208,
      "step": 271,
      "step_time": 10.397699974999341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6460685841739178,
      "epoch": 0.00272,
      "grad_norm": 0.001033518579788506,
      "kl": 1.4755041021853685,
      "learning_rate": 7.999979379151197e-06,
      "loss": 0.0,
      "step": 272,
      "step_time": 6.4973694099994646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0658483356237411,
      "epoch": 0.00273,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 9.214817691827193e-05,
      "kl": 0.6959873335435987,
      "learning_rate": 7.999979204028205e-06,
      "loss": 0.0,
      "num_tokens": 6453348.0,
      "reward": 2.075000047683716,
      "reward_std": 1.660742163658142,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 1.263635277748108,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.0120024681091309,
      "sampling/importance_sampling_ratio/mean": 0.8527206778526306,
      "sampling/importance_sampling_ratio/min": 0.5133639574050903,
      "sampling/sampling_logp_difference/max": 0.5644892454147339,
      "sampling/sampling_logp_difference/mean": 0.08288981765508652,
      "step": 273,
      "step_time": 9.938741726000444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0469078943133354,
      "epoch": 0.00274,
      "grad_norm": 0.00013808686344418675,
      "kl": 0.6965378280729055,
      "learning_rate": 7.999979028164737e-06,
      "loss": 0.0,
      "step": 274,
      "step_time": 6.027978434999113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.4376696143299341,
      "epoch": 0.00275,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 1.7920627215062268e-05,
      "kl": 1.6866260170936584,
      "learning_rate": 7.999978851560795e-06,
      "loss": 0.0,
      "num_tokens": 6502294.0,
      "reward": 3.262500047683716,
      "reward_std": 1.2296733856201172,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.25,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.75,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": 0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.3865877389907837,
      "sampling/importance_sampling_ratio/mean": 0.9549223184585571,
      "sampling/importance_sampling_ratio/min": 0.7004026770591736,
      "sampling/sampling_logp_difference/max": 0.33465075492858887,
      "sampling/sampling_logp_difference/mean": 0.03543006628751755,
      "step": 275,
      "step_time": 9.919397402000413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.43271936662495136,
      "epoch": 0.00276,
      "grad_norm": 1.5806304872967303e-05,
      "kl": 1.6843811720609665,
      "learning_rate": 7.999978674216379e-06,
      "loss": 0.0,
      "step": 276,
      "step_time": 6.94891806600026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.586757205426693,
      "epoch": 0.00277,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00038270672666840255,
      "kl": 2.193971574306488,
      "learning_rate": 7.99997849613149e-06,
      "loss": 0.0,
      "num_tokens": 6549161.0,
      "reward": 1.7625000476837158,
      "reward_std": 0.3965577781200409,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.461033821105957,
      "sampling/importance_sampling_ratio/mean": 0.9305300712585449,
      "sampling/importance_sampling_ratio/min": 0.20976446568965912,
      "sampling/sampling_logp_difference/max": 1.4805474281311035,
      "sampling/sampling_logp_difference/mean": 0.15049991011619568,
      "step": 277,
      "step_time": 10.054510989999471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5988273657858372,
      "epoch": 0.00278,
      "grad_norm": 0.00021034348173998296,
      "kl": 2.174222081899643,
      "learning_rate": 7.999978317306126e-06,
      "loss": 0.0,
      "step": 278,
      "step_time": 6.037595613000121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9894053190946579,
      "epoch": 0.00279,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 7.872794958529994e-05,
      "kl": 0.7821534425020218,
      "learning_rate": 7.999978137740288e-06,
      "loss": 0.0,
      "num_tokens": 6597198.0,
      "reward": 2.1374998092651367,
      "reward_std": 1.7677668333053589,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.6875,
      "rewards/probe_completion_length/std": 1.4241578578948975,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.947765588760376,
      "sampling/importance_sampling_ratio/mean": 0.8623788356781006,
      "sampling/importance_sampling_ratio/min": 0.31989023089408875,
      "sampling/sampling_logp_difference/max": 0.7898147106170654,
      "sampling/sampling_logp_difference/mean": 0.11040355265140533,
      "step": 279,
      "step_time": 10.506534798000303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.00410046428442,
      "epoch": 0.0028,
      "grad_norm": 5.26364310644567e-05,
      "kl": 0.7675415091216564,
      "learning_rate": 7.999977957433975e-06,
      "loss": 0.0,
      "step": 280,
      "step_time": 6.0566636830008065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.3125,
      "completions/mean_terminated_length": 2.3125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.015853799879551,
      "epoch": 0.00281,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.767720358562656e-05,
      "kl": 0.06597586660063826,
      "learning_rate": 7.999977776387188e-06,
      "loss": 0.0,
      "num_tokens": 6641048.0,
      "reward": 1.7625000476837158,
      "reward_std": 1.3060035705566406,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.3125,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 0.9529492855072021,
      "sampling/importance_sampling_ratio/mean": 0.8520058393478394,
      "sampling/importance_sampling_ratio/min": 0.516437292098999,
      "sampling/sampling_logp_difference/max": 0.35334205627441406,
      "sampling/sampling_logp_difference/mean": 0.07898004353046417,
      "step": 281,
      "step_time": 10.406129021000197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0222055688500404,
      "epoch": 0.00282,
      "grad_norm": 4.6576886234106496e-05,
      "kl": 0.06345673595205881,
      "learning_rate": 7.999977594599927e-06,
      "loss": 0.0,
      "step": 282,
      "step_time": 6.00192527199988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9479651227593422,
      "epoch": 0.00283,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 3.634697350207716e-05,
      "kl": 0.46159003078355454,
      "learning_rate": 7.999977412072193e-06,
      "loss": 0.0,
      "num_tokens": 6684990.0,
      "reward": 1.9187500476837158,
      "reward_std": 1.256836175918579,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.1589446067810059,
      "sampling/importance_sampling_ratio/mean": 0.8635474443435669,
      "sampling/importance_sampling_ratio/min": 0.5721397995948792,
      "sampling/sampling_logp_difference/max": 0.4582357704639435,
      "sampling/sampling_logp_difference/mean": 0.08887867629528046,
      "step": 283,
      "step_time": 10.391508174999217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9357596710324287,
      "epoch": 0.00284,
      "grad_norm": 3.538501914590597e-05,
      "kl": 0.46022001723758876,
      "learning_rate": 7.999977228803984e-06,
      "loss": 0.0,
      "step": 284,
      "step_time": 5.972688897000353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.71875,
      "completions/mean_terminated_length": 2.71875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8306822814047337,
      "epoch": 0.00285,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.003344170982018113,
      "kl": 1.2514954134821892,
      "learning_rate": 7.999977044795302e-06,
      "loss": 0.0,
      "num_tokens": 6730915.0,
      "reward": 2.106250047683716,
      "reward_std": 1.0809009075164795,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.71875,
      "rewards/probe_completion_length/std": 0.45680341124534607,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.78125,
      "rewards/probe_shaping_dominance/std": 0.420013427734375,
      "rewards/probe_terminal_raw/mean": 0.21875,
      "rewards/probe_terminal_raw/std": 0.420013427734375,
      "rewards/rollout_reward_func/mean": -0.5625,
      "rewards/rollout_reward_func/std": 0.84002685546875,
      "sampling/importance_sampling_ratio/max": 1.9693899154663086,
      "sampling/importance_sampling_ratio/mean": 0.9473754167556763,
      "sampling/importance_sampling_ratio/min": 0.4411812424659729,
      "sampling/sampling_logp_difference/max": 0.7108652591705322,
      "sampling/sampling_logp_difference/mean": 0.10070252418518066,
      "step": 285,
      "step_time": 10.50476976299933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8169671483337879,
      "epoch": 0.00286,
      "grad_norm": 0.002965368563309312,
      "kl": 1.2458821088075638,
      "learning_rate": 7.999976860046145e-06,
      "loss": -0.0,
      "step": 286,
      "step_time": 6.519596415000251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9309164360165596,
      "epoch": 0.00287,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0011960415868088603,
      "kl": 1.297960703726858,
      "learning_rate": 7.999976674556518e-06,
      "loss": 0.0,
      "num_tokens": 6779611.0,
      "reward": 1.4500000476837158,
      "reward_std": 0.5080004930496216,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 2.1808512210845947,
      "sampling/importance_sampling_ratio/mean": 0.9273348450660706,
      "sampling/importance_sampling_ratio/min": 0.3587171137332916,
      "sampling/sampling_logp_difference/max": 1.1489166021347046,
      "sampling/sampling_logp_difference/mean": 0.13752076029777527,
      "step": 287,
      "step_time": 10.203225437000128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9222960993647575,
      "epoch": 0.00288,
      "grad_norm": 0.0010410655522719026,
      "kl": 1.2635090134572238,
      "learning_rate": 7.999976488326414e-06,
      "loss": 0.0,
      "step": 288,
      "step_time": 6.195064060999812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.28125,
      "completions/mean_terminated_length": 2.28125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.526524841785431,
      "epoch": 0.00289,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 8.704458741704002e-05,
      "kl": 0.1819338989444077,
      "learning_rate": 7.999976301355836e-06,
      "loss": 0.0,
      "num_tokens": 6830139.0,
      "reward": 1.2312500476837158,
      "reward_std": 0.45680341124534607,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.28125,
      "rewards/probe_completion_length/std": 0.45680341124534607,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.6263999938964844,
      "sampling/importance_sampling_ratio/mean": 0.797622799873352,
      "sampling/importance_sampling_ratio/min": 0.3925057351589203,
      "sampling/sampling_logp_difference/max": 0.7666944265365601,
      "sampling/sampling_logp_difference/mean": 0.1322537660598755,
      "step": 289,
      "step_time": 10.485197891000098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.5026581287384033,
      "epoch": 0.0029,
      "grad_norm": 9.270982991438359e-05,
      "kl": 0.18431306723505259,
      "learning_rate": 7.999976113644787e-06,
      "loss": 0.0,
      "step": 290,
      "step_time": 6.567377654000666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7679154798388481,
      "epoch": 0.00291,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0001906748948385939,
      "kl": 1.2085663825273514,
      "learning_rate": 7.999975925193261e-06,
      "loss": 0.0,
      "num_tokens": 6878792.0,
      "reward": 2.731250047683716,
      "reward_std": 1.2885193824768066,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.0082029104232788,
      "sampling/importance_sampling_ratio/mean": 0.889763593673706,
      "sampling/importance_sampling_ratio/min": 0.6102039217948914,
      "sampling/sampling_logp_difference/max": 0.4290947914123535,
      "sampling/sampling_logp_difference/mean": 0.05696694180369377,
      "step": 291,
      "step_time": 10.025515152000025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.755341375246644,
      "epoch": 0.00292,
      "grad_norm": 9.211798169417307e-05,
      "kl": 1.1880728974938393,
      "learning_rate": 7.999975736001263e-06,
      "loss": 0.0,
      "step": 292,
      "step_time": 6.049688758999764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0296694338321686,
      "epoch": 0.00293,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004035626188851893,
      "kl": 0.7810260709375143,
      "learning_rate": 7.999975546068793e-06,
      "loss": 0.0,
      "num_tokens": 6929365.0,
      "reward": 1.4812500476837158,
      "reward_std": 0.507007360458374,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 2.1541950702667236,
      "sampling/importance_sampling_ratio/mean": 0.9323853254318237,
      "sampling/importance_sampling_ratio/min": 0.3904179334640503,
      "sampling/sampling_logp_difference/max": 0.9739227294921875,
      "sampling/sampling_logp_difference/mean": 0.1474728286266327,
      "step": 293,
      "step_time": 10.618863860999681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0283247902989388,
      "epoch": 0.00294,
      "grad_norm": 0.0003429663192946464,
      "kl": 0.720265094190836,
      "learning_rate": 7.999975355395847e-06,
      "loss": 0.0,
      "step": 294,
      "step_time": 6.128017419000116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8973243683576584,
      "epoch": 0.00295,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.002437910996377468,
      "kl": 0.7489821650087833,
      "learning_rate": 7.999975163982429e-06,
      "loss": -0.0001,
      "num_tokens": 6978906.0,
      "reward": 2.200000047683716,
      "reward_std": 1.4142135381698608,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.625,
      "rewards/probe_shaping_dominance/std": 0.49186936020851135,
      "rewards/probe_terminal_raw/mean": 0.375,
      "rewards/probe_terminal_raw/std": 0.49186936020851135,
      "rewards/rollout_reward_func/mean": -0.25,
      "rewards/rollout_reward_func/std": 0.9837387204170227,
      "sampling/importance_sampling_ratio/max": 1.274229884147644,
      "sampling/importance_sampling_ratio/mean": 0.8626682162284851,
      "sampling/importance_sampling_ratio/min": 0.45520147681236267,
      "sampling/sampling_logp_difference/max": 0.44555020332336426,
      "sampling/sampling_logp_difference/mean": 0.09587711840867996,
      "step": 295,
      "step_time": 10.517878631000258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8956578373908997,
      "epoch": 0.00296,
      "grad_norm": 0.0023497892543673515,
      "kl": 0.7520033298060298,
      "learning_rate": 7.999974971828538e-06,
      "loss": -0.0001,
      "step": 296,
      "step_time": 6.085854749000191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8559794425964355,
      "epoch": 0.00297,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006165934028103948,
      "kl": 1.2970306277275085,
      "learning_rate": 7.999974778934173e-06,
      "loss": 0.0,
      "num_tokens": 7022843.0,
      "reward": 2.075000047683716,
      "reward_std": 1.1845782995224,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 2.3478376865386963,
      "sampling/importance_sampling_ratio/mean": 0.9364867210388184,
      "sampling/importance_sampling_ratio/min": 0.2075859159231186,
      "sampling/sampling_logp_difference/max": 1.4937734603881836,
      "sampling/sampling_logp_difference/mean": 0.11856293678283691,
      "step": 297,
      "step_time": 10.372231002998888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8517452776432037,
      "epoch": 0.00298,
      "grad_norm": 0.0012263483367860317,
      "kl": 1.4815890714526176,
      "learning_rate": 7.999974585299335e-06,
      "loss": 0.0,
      "step": 298,
      "step_time": 6.441055234999112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.1535784602165222,
      "epoch": 0.00299,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00010754216782515869,
      "kl": 0.1399077856913209,
      "learning_rate": 7.999974390924023e-06,
      "loss": 0.0,
      "num_tokens": 7074516.0,
      "reward": 1.4500000476837158,
      "reward_std": 0.5080004930496216,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 0.9696638584136963,
      "sampling/importance_sampling_ratio/mean": 0.8000353574752808,
      "sampling/importance_sampling_ratio/min": 0.4501090347766876,
      "sampling/sampling_logp_difference/max": 0.7076842188835144,
      "sampling/sampling_logp_difference/mean": 0.11227491497993469,
      "step": 299,
      "step_time": 10.031227658000262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1581177338957787,
      "epoch": 0.003,
      "grad_norm": 0.00011685074423439801,
      "kl": 0.1419827735517174,
      "learning_rate": 7.999974195808239e-06,
      "loss": 0.0,
      "step": 300,
      "step_time": 6.074399898000138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8323883041739464,
      "epoch": 0.00301,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0001984036498470232,
      "kl": 0.813261678442359,
      "learning_rate": 7.999973999951982e-06,
      "loss": 0.0,
      "num_tokens": 7125615.0,
      "reward": 2.043750047683716,
      "reward_std": 1.201058030128479,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.0074903964996338,
      "sampling/importance_sampling_ratio/mean": 0.8131163716316223,
      "sampling/importance_sampling_ratio/min": 0.4442375898361206,
      "sampling/sampling_logp_difference/max": 0.6795529127120972,
      "sampling/sampling_logp_difference/mean": 0.09874939173460007,
      "step": 301,
      "step_time": 11.23406893099991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8443499356508255,
      "epoch": 0.00302,
      "grad_norm": 0.00015355952200479805,
      "kl": 0.8130504302680492,
      "learning_rate": 7.99997380335525e-06,
      "loss": 0.0,
      "step": 302,
      "step_time": 6.22487125599946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.6875,
      "completions/mean_terminated_length": 2.6875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6427240371704102,
      "epoch": 0.00303,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000308831047732383,
      "kl": 1.0533999186009169,
      "learning_rate": 7.999973606018048e-06,
      "loss": 0.0,
      "num_tokens": 7174736.0,
      "reward": 2.231250047683716,
      "reward_std": 1.0846248865127563,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.6875,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.09375,
      "rewards/probe_invalid_count/std": 0.2961445748806,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.6255303621292114,
      "sampling/importance_sampling_ratio/mean": 0.8998515009880066,
      "sampling/importance_sampling_ratio/min": 0.24930788576602936,
      "sampling/sampling_logp_difference/max": 1.3098429441452026,
      "sampling/sampling_logp_difference/mean": 0.11029675602912903,
      "step": 303,
      "step_time": 9.959219268000197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6381810642778873,
      "epoch": 0.00304,
      "grad_norm": 0.00026818981859833,
      "kl": 1.0389648918062449,
      "learning_rate": 7.99997340794037e-06,
      "loss": 0.0,
      "step": 304,
      "step_time": 6.502279902000737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9642986953258514,
      "epoch": 0.00305,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00017586298054084182,
      "kl": 0.37446144194109365,
      "learning_rate": 7.999973209122222e-06,
      "loss": 0.0,
      "num_tokens": 7224003.0,
      "reward": 1.4187500476837158,
      "reward_std": 0.507007360458374,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.28328537940979,
      "sampling/importance_sampling_ratio/mean": 0.8521016836166382,
      "sampling/importance_sampling_ratio/min": 0.4148902893066406,
      "sampling/sampling_logp_difference/max": 0.7877531051635742,
      "sampling/sampling_logp_difference/mean": 0.10069666802883148,
      "step": 305,
      "step_time": 10.471455122999942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9792665392160416,
      "epoch": 0.00306,
      "grad_norm": 0.00018103634647559375,
      "kl": 0.3658354175131535,
      "learning_rate": 7.999973009563599e-06,
      "loss": 0.0,
      "step": 306,
      "step_time": 6.058177401000194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8190870024263859,
      "epoch": 0.00307,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.002435210859403014,
      "kl": 0.9355106446892023,
      "learning_rate": 7.999972809264505e-06,
      "loss": -0.0001,
      "num_tokens": 7271665.0,
      "reward": 2.293750047683716,
      "reward_std": 1.2853862047195435,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.65625,
      "rewards/probe_shaping_dominance/std": 0.4825586974620819,
      "rewards/probe_terminal_raw/mean": 0.34375,
      "rewards/probe_terminal_raw/std": 0.4825586974620819,
      "rewards/rollout_reward_func/mean": -0.3125,
      "rewards/rollout_reward_func/std": 0.9651173949241638,
      "sampling/importance_sampling_ratio/max": 1.3618639707565308,
      "sampling/importance_sampling_ratio/mean": 0.8423999547958374,
      "sampling/importance_sampling_ratio/min": 0.15370477735996246,
      "sampling/sampling_logp_difference/max": 1.2758878469467163,
      "sampling/sampling_logp_difference/mean": 0.1437636762857437,
      "step": 307,
      "step_time": 10.07711297600008
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.8193341009318829,
      "epoch": 0.00308,
      "grad_norm": 0.002486324403434992,
      "kl": 0.9421810358762741,
      "learning_rate": 7.999972608224937e-06,
      "loss": -0.0001,
      "step": 308,
      "step_time": 6.035029341999234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.375,
      "completions/mean_terminated_length": 2.375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7972683534026146,
      "epoch": 0.00309,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 2.102801408909727e-05,
      "kl": 0.66733174957335,
      "learning_rate": 7.999972406444895e-06,
      "loss": 0.0,
      "num_tokens": 7324209.0,
      "reward": 1.3250000476837158,
      "reward_std": 0.49186936020851135,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.375,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.0272186994552612,
      "sampling/importance_sampling_ratio/mean": 0.8698651790618896,
      "sampling/importance_sampling_ratio/min": 0.6035609245300293,
      "sampling/sampling_logp_difference/max": 0.34899115562438965,
      "sampling/sampling_logp_difference/mean": 0.06759645789861679,
      "step": 309,
      "step_time": 11.134619106000173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8096836917102337,
      "epoch": 0.0031,
      "grad_norm": 1.849373620643746e-05,
      "kl": 0.6643700585700572,
      "learning_rate": 7.999972203924383e-06,
      "loss": 0.0,
      "step": 310,
      "step_time": 6.1093775609997465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.1299720034003258,
      "epoch": 0.00311,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003075299086049199,
      "kl": 1.807578288950026,
      "learning_rate": 7.999972000663396e-06,
      "loss": 0.0,
      "num_tokens": 7369434.0,
      "reward": 1.5750000476837158,
      "reward_std": 1.263635277748108,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 1.263635277748108,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 2.1367149353027344,
      "sampling/importance_sampling_ratio/mean": 0.816227912902832,
      "sampling/importance_sampling_ratio/min": 0.17019084095954895,
      "sampling/sampling_logp_difference/max": 1.5584993362426758,
      "sampling/sampling_logp_difference/mean": 0.14971722662448883,
      "step": 311,
      "step_time": 9.711283419999745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1384611800312996,
      "epoch": 0.00312,
      "grad_norm": 0.001805527019314468,
      "kl": 1.2573999939486384,
      "learning_rate": 7.999971796661938e-06,
      "loss": 0.0,
      "step": 312,
      "step_time": 6.865537390999634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8577270582318306,
      "epoch": 0.00313,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0013959156349301338,
      "kl": 1.3633378157392144,
      "learning_rate": 7.999971591920007e-06,
      "loss": -0.0,
      "num_tokens": 7419095.0,
      "reward": 1.9187500476837158,
      "reward_std": 1.1773227453231812,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.78125,
      "rewards/probe_shaping_dominance/std": 0.420013427734375,
      "rewards/probe_terminal_raw/mean": 0.21875,
      "rewards/probe_terminal_raw/std": 0.420013427734375,
      "rewards/rollout_reward_func/mean": -0.5625,
      "rewards/rollout_reward_func/std": 0.84002685546875,
      "sampling/importance_sampling_ratio/max": 1.8378033638000488,
      "sampling/importance_sampling_ratio/mean": 0.9367517232894897,
      "sampling/importance_sampling_ratio/min": 0.4363749027252197,
      "sampling/sampling_logp_difference/max": 0.8578743934631348,
      "sampling/sampling_logp_difference/mean": 0.12448535114526749,
      "step": 313,
      "step_time": 10.324483344001237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8521794825792313,
      "epoch": 0.00314,
      "grad_norm": 0.0013546322006732225,
      "kl": 1.3631921615451574,
      "learning_rate": 7.999971386437603e-06,
      "loss": -0.0,
      "step": 314,
      "step_time": 6.133231628999511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.21875,
      "completions/mean_terminated_length": 2.21875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2760131061077118,
      "epoch": 0.00315,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.7212684371043e-05,
      "kl": 0.1668471217271872,
      "learning_rate": 7.999971180214728e-06,
      "loss": 0.0,
      "num_tokens": 7469149.0,
      "reward": 1.1687500476837158,
      "reward_std": 0.420013427734375,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.21875,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.3021711111068726,
      "sampling/importance_sampling_ratio/mean": 0.8215183019638062,
      "sampling/importance_sampling_ratio/min": 0.38338208198547363,
      "sampling/sampling_logp_difference/max": 0.8493318557739258,
      "sampling/sampling_logp_difference/mean": 0.13118374347686768,
      "step": 315,
      "step_time": 10.013548799000091
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.2984453290700912,
      "epoch": 0.00316,
      "grad_norm": 5.114684608997777e-05,
      "kl": 0.16854519414482638,
      "learning_rate": 7.99997097325138e-06,
      "loss": 0.0,
      "step": 316,
      "step_time": 7.113780104000853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0801144316792488,
      "epoch": 0.00317,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 8.528330363333225e-05,
      "kl": 0.07406117697246373,
      "learning_rate": 7.999970765547559e-06,
      "loss": 0.0,
      "num_tokens": 7517551.0,
      "reward": 2.012500047683716,
      "reward_std": 1.2164862155914307,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.1232194900512695,
      "sampling/importance_sampling_ratio/mean": 0.8244851231575012,
      "sampling/importance_sampling_ratio/min": 0.3947485089302063,
      "sampling/sampling_logp_difference/max": 0.4756275415420532,
      "sampling/sampling_logp_difference/mean": 0.10443481802940369,
      "step": 317,
      "step_time": 9.911432873999729
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0896883979439735,
      "epoch": 0.00318,
      "grad_norm": 8.261961193056777e-05,
      "kl": 0.08147628034930676,
      "learning_rate": 7.999970557103267e-06,
      "loss": 0.0,
      "step": 318,
      "step_time": 6.029059536999739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.34375,
      "completions/mean_terminated_length": 2.34375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.1932886689901352,
      "epoch": 0.00319,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00011847085261251777,
      "kl": 0.7054581567645073,
      "learning_rate": 7.999970347918501e-06,
      "loss": 0.0,
      "num_tokens": 7565287.0,
      "reward": 1.7937500476837158,
      "reward_std": 1.297873616218567,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.34375,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.246423363685608,
      "sampling/importance_sampling_ratio/mean": 0.8327354192733765,
      "sampling/importance_sampling_ratio/min": 0.40081343054771423,
      "sampling/sampling_logp_difference/max": 0.558934211730957,
      "sampling/sampling_logp_difference/mean": 0.11713166534900665,
      "step": 319,
      "step_time": 9.851570212000752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.194627471268177,
      "epoch": 0.0032,
      "grad_norm": 0.00012471656373236328,
      "kl": 0.6935346499085426,
      "learning_rate": 7.999970137993264e-06,
      "loss": 0.0,
      "step": 320,
      "step_time": 7.014572012999906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5268306843936443,
      "epoch": 0.00321,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 1.4538780305883847e-05,
      "kl": 0.13023399263329338,
      "learning_rate": 7.999969927327556e-06,
      "loss": 0.0,
      "num_tokens": 7605921.0,
      "reward": 3.231250047683716,
      "reward_std": 1.2759405374526978,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.25,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.75,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": 0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.2444671392440796,
      "sampling/importance_sampling_ratio/mean": 0.9207519292831421,
      "sampling/importance_sampling_ratio/min": 0.4403969347476959,
      "sampling/sampling_logp_difference/max": 0.6484390497207642,
      "sampling/sampling_logp_difference/mean": 0.043011896312236786,
      "step": 321,
      "step_time": 9.745958758999222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5186222679913044,
      "epoch": 0.00322,
      "grad_norm": 1.4087959243624937e-05,
      "kl": 0.13184529542922974,
      "learning_rate": 7.999969715921373e-06,
      "loss": 0.0,
      "step": 322,
      "step_time": 5.857027678000122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.45947896130383015,
      "epoch": 0.00323,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 3.156924503855407e-05,
      "kl": 1.6311049610376358,
      "learning_rate": 7.999969503774719e-06,
      "loss": 0.0,
      "num_tokens": 7647859.0,
      "reward": 3.325000047683716,
      "reward_std": 1.1288018226623535,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.25,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.75,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": 0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 0.995955228805542,
      "sampling/importance_sampling_ratio/mean": 0.9161585569381714,
      "sampling/importance_sampling_ratio/min": 0.4671463966369629,
      "sampling/sampling_logp_difference/max": 0.5933887958526611,
      "sampling/sampling_logp_difference/mean": 0.040462058037519455,
      "step": 323,
      "step_time": 10.309850834999452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.4685290865600109,
      "epoch": 0.00324,
      "grad_norm": 7.684806769248098e-05,
      "kl": 1.6309725642204285,
      "learning_rate": 7.999969290887594e-06,
      "loss": 0.0,
      "step": 324,
      "step_time": 6.386313178999899
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.375,
      "completions/mean_terminated_length": 2.375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2863778471946716,
      "epoch": 0.00325,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00012449390487745404,
      "kl": 0.10761683306191117,
      "learning_rate": 7.999969077259998e-06,
      "loss": 0.0,
      "num_tokens": 7696966.0,
      "reward": 1.3875000476837158,
      "reward_std": 1.435438632965088,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.4375,
      "rewards/probe_completion_length/std": 1.4354385137557983,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.1330184936523438,
      "sampling/importance_sampling_ratio/mean": 0.7982755899429321,
      "sampling/importance_sampling_ratio/min": 0.30686163902282715,
      "sampling/sampling_logp_difference/max": 1.0647956132888794,
      "sampling/sampling_logp_difference/mean": 0.1156022697687149,
      "step": 325,
      "step_time": 9.99423157000001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.289127841591835,
      "epoch": 0.00326,
      "grad_norm": 9.207100083585829e-05,
      "kl": 0.09634128154721111,
      "learning_rate": 7.999968862891929e-06,
      "loss": 0.0,
      "step": 326,
      "step_time": 6.051717358000587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.71875,
      "completions/mean_terminated_length": 2.71875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.097517840564251,
      "epoch": 0.00327,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0018292975146323442,
      "kl": 0.41003574151545763,
      "learning_rate": 7.999968647783389e-06,
      "loss": -0.0001,
      "num_tokens": 7743961.0,
      "reward": 1.9500000476837158,
      "reward_std": 1.4591203927993774,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.75,
      "rewards/probe_completion_length/std": 1.2443419694900513,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.875,
      "rewards/probe_shaping_dominance/std": 0.33601075410842896,
      "rewards/probe_terminal_raw/mean": 0.125,
      "rewards/probe_terminal_raw/std": 0.33601075410842896,
      "rewards/rollout_reward_func/mean": -0.75,
      "rewards/rollout_reward_func/std": 0.6720215082168579,
      "sampling/importance_sampling_ratio/max": 1.4463415145874023,
      "sampling/importance_sampling_ratio/mean": 0.7840434312820435,
      "sampling/importance_sampling_ratio/min": 0.21838130056858063,
      "sampling/sampling_logp_difference/max": 1.3481109142303467,
      "sampling/sampling_logp_difference/mean": 0.14009910821914673,
      "step": 327,
      "step_time": 10.658869628999582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0987923666834831,
      "epoch": 0.00328,
      "grad_norm": 0.0015403992729261518,
      "kl": 0.40840206295251846,
      "learning_rate": 7.999968431934376e-06,
      "loss": -0.0001,
      "step": 328,
      "step_time": 6.608717246999731
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.3225805312395096,
      "epoch": 0.00329,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000299303384963423,
      "kl": 0.4579396164044738,
      "learning_rate": 7.999968215344892e-06,
      "loss": 0.0,
      "num_tokens": 7794251.0,
      "reward": 1.4187500476837158,
      "reward_std": 0.507007360458374,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.700522541999817,
      "sampling/importance_sampling_ratio/mean": 0.7628653049468994,
      "sampling/importance_sampling_ratio/min": 0.2527129054069519,
      "sampling/sampling_logp_difference/max": 1.2428406476974487,
      "sampling/sampling_logp_difference/mean": 0.17843830585479736,
      "step": 329,
      "step_time": 10.212800845000402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.32346860319376,
      "epoch": 0.0033,
      "grad_norm": 0.00023284112103283405,
      "kl": 0.41496767988428473,
      "learning_rate": 7.999967998014936e-06,
      "loss": 0.0,
      "step": 330,
      "step_time": 6.193185579000328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0128037929534912,
      "epoch": 0.00331,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.9650203183991835e-05,
      "kl": 0.9195960871875286,
      "learning_rate": 7.999967779944508e-06,
      "loss": 0.0,
      "num_tokens": 7844694.0,
      "reward": 1.9187500476837158,
      "reward_std": 1.256836175918579,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.2084201574325562,
      "sampling/importance_sampling_ratio/mean": 0.8935514688491821,
      "sampling/importance_sampling_ratio/min": 0.5958609580993652,
      "sampling/sampling_logp_difference/max": 0.49711179733276367,
      "sampling/sampling_logp_difference/mean": 0.08623553812503815,
      "step": 331,
      "step_time": 10.570847360000243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0223295539617538,
      "epoch": 0.00332,
      "grad_norm": 4.8721845814725384e-05,
      "kl": 0.9137447625398636,
      "learning_rate": 7.99996756113361e-06,
      "loss": 0.0,
      "step": 332,
      "step_time": 6.144921876000353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6741441264748573,
      "epoch": 0.00333,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 1.6808338841656223e-05,
      "kl": 0.7198268654756248,
      "learning_rate": 7.999967341582239e-06,
      "loss": 0.0,
      "num_tokens": 7890635.0,
      "reward": 2.512500047683716,
      "reward_std": 1.4797013998031616,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 0.9704124927520752,
      "sampling/importance_sampling_ratio/mean": 0.8803048133850098,
      "sampling/importance_sampling_ratio/min": 0.6561123728752136,
      "sampling/sampling_logp_difference/max": 0.5014957785606384,
      "sampling/sampling_logp_difference/mean": 0.05533091723918915,
      "step": 333,
      "step_time": 10.435302833999685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.681053951382637,
      "epoch": 0.00334,
      "grad_norm": 2.799690446408931e-05,
      "kl": 0.7224257413763553,
      "learning_rate": 7.999967121290396e-06,
      "loss": 0.0,
      "step": 334,
      "step_time": 6.085340915999041
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7493787109851837,
      "epoch": 0.00335,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 8.788624836597592e-05,
      "kl": 0.3644955639792897,
      "learning_rate": 7.999966900258084e-06,
      "loss": 0.0,
      "num_tokens": 7935553.0,
      "reward": 2.731250047683716,
      "reward_std": 1.2885193824768066,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.079957365989685,
      "sampling/importance_sampling_ratio/mean": 0.87494957447052,
      "sampling/importance_sampling_ratio/min": 0.37640491127967834,
      "sampling/sampling_logp_difference/max": 0.6546320915222168,
      "sampling/sampling_logp_difference/mean": 0.07020466029644012,
      "step": 335,
      "step_time": 10.833335889999944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7399843037128448,
      "epoch": 0.00336,
      "grad_norm": 4.674637602875009e-05,
      "kl": 0.3563638808336691,
      "learning_rate": 7.9999666784853e-06,
      "loss": 0.0,
      "step": 336,
      "step_time": 6.176430282999718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2268180549144745,
      "epoch": 0.00337,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0032034514006227255,
      "kl": 0.92481086589396,
      "learning_rate": 7.999966455972044e-06,
      "loss": -0.0001,
      "num_tokens": 7985896.0,
      "reward": 1.8875000476837158,
      "reward_std": 1.1053389310836792,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.8125,
      "rewards/probe_shaping_dominance/std": 0.3965577781200409,
      "rewards/probe_terminal_raw/mean": 0.1875,
      "rewards/probe_terminal_raw/std": 0.3965577781200409,
      "rewards/rollout_reward_func/mean": -0.625,
      "rewards/rollout_reward_func/std": 0.7931155562400818,
      "sampling/importance_sampling_ratio/max": 1.6343834400177002,
      "sampling/importance_sampling_ratio/mean": 0.828697919845581,
      "sampling/importance_sampling_ratio/min": 0.12531737983226776,
      "sampling/sampling_logp_difference/max": 1.9917540550231934,
      "sampling/sampling_logp_difference/mean": 0.15161240100860596,
      "step": 337,
      "step_time": 11.238573729999644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 1.2293306812644005,
      "epoch": 0.00338,
      "grad_norm": 0.0005875456845387816,
      "kl": 0.9996783956885338,
      "learning_rate": 7.999966232718316e-06,
      "loss": -0.0001,
      "step": 338,
      "step_time": 6.34771773500006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.4375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9276796728372574,
      "epoch": 0.00339,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.245446325512603e-05,
      "kl": 0.5849672869662754,
      "learning_rate": 7.999966008724119e-06,
      "loss": 0.0,
      "num_tokens": 8037066.0,
      "reward": 1.8875000476837158,
      "reward_std": 1.2684128284454346,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.4375,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.435063362121582,
      "sampling/importance_sampling_ratio/mean": 0.895423173904419,
      "sampling/importance_sampling_ratio/min": 0.6500217914581299,
      "sampling/sampling_logp_difference/max": 0.41063392162323,
      "sampling/sampling_logp_difference/mean": 0.08168335258960724,
      "step": 339,
      "step_time": 11.09809919000054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9489234611392021,
      "epoch": 0.0034,
      "grad_norm": 5.4233478294918314e-05,
      "kl": 0.5783893242478371,
      "learning_rate": 7.99996578398945e-06,
      "loss": 0.0,
      "step": 340,
      "step_time": 6.86810050899885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6301294788718224,
      "epoch": 0.00341,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 3.1229348678607494e-05,
      "kl": 0.5702050784602761,
      "learning_rate": 7.99996555851431e-06,
      "loss": 0.0,
      "num_tokens": 8084129.0,
      "reward": 2.731250047683716,
      "reward_std": 1.2885193824768066,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.1891618967056274,
      "sampling/importance_sampling_ratio/mean": 0.8978214263916016,
      "sampling/importance_sampling_ratio/min": 0.6014400124549866,
      "sampling/sampling_logp_difference/max": 0.3437986373901367,
      "sampling/sampling_logp_difference/mean": 0.05562354624271393,
      "step": 341,
      "step_time": 10.276972778999607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6289165541529655,
      "epoch": 0.00342,
      "grad_norm": 0.00010118436330230907,
      "kl": 0.5909891685005277,
      "learning_rate": 7.999965332298698e-06,
      "loss": 0.0,
      "step": 342,
      "step_time": 6.750384622999718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.680240910500288,
      "epoch": 0.00343,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 9.979560854844749e-05,
      "kl": 1.108576413244009,
      "learning_rate": 7.999965105342615e-06,
      "loss": 0.0,
      "num_tokens": 8130707.0,
      "reward": 2.793750047683716,
      "reward_std": 1.221035361289978,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 0.974475622177124,
      "sampling/importance_sampling_ratio/mean": 0.7939993143081665,
      "sampling/importance_sampling_ratio/min": 0.2770652174949646,
      "sampling/sampling_logp_difference/max": 0.9215399026870728,
      "sampling/sampling_logp_difference/mean": 0.10923172533512115,
      "step": 343,
      "step_time": 10.298724081999353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6709529925137758,
      "epoch": 0.00344,
      "grad_norm": 0.00011715908476617187,
      "kl": 1.1041308715939522,
      "learning_rate": 7.999964877646064e-06,
      "loss": 0.0,
      "step": 344,
      "step_time": 6.713455954999972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.934910848736763,
      "epoch": 0.00345,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001362826325930655,
      "kl": 0.9546709419228137,
      "learning_rate": 7.99996464920904e-06,
      "loss": 0.0,
      "num_tokens": 8179015.0,
      "reward": 2.012500047683716,
      "reward_std": 1.2164862155914307,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.38411545753479,
      "sampling/importance_sampling_ratio/mean": 0.8699967861175537,
      "sampling/importance_sampling_ratio/min": 0.14834560453891754,
      "sampling/sampling_logp_difference/max": 1.8495113849639893,
      "sampling/sampling_logp_difference/mean": 0.11092771589756012,
      "step": 345,
      "step_time": 10.603762035999353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9600491523742676,
      "epoch": 0.00346,
      "grad_norm": 0.0016778368735685945,
      "kl": 1.0116195227019489,
      "learning_rate": 7.999964420031546e-06,
      "loss": 0.0,
      "step": 346,
      "step_time": 6.956354311000268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5329532325267792,
      "epoch": 0.00347,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0027003376744687557,
      "kl": 1.3288137391209602,
      "learning_rate": 7.99996419011358e-06,
      "loss": -0.0001,
      "num_tokens": 8224667.0,
      "reward": 3.168750047683716,
      "reward_std": 1.2110878229141235,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.3125,
      "rewards/probe_shaping_dominance/std": 0.4709290862083435,
      "rewards/probe_terminal_raw/mean": 0.6875,
      "rewards/probe_terminal_raw/std": 0.4709290862083435,
      "rewards/rollout_reward_func/mean": 0.375,
      "rewards/rollout_reward_func/std": 0.941858172416687,
      "sampling/importance_sampling_ratio/max": 1.0700640678405762,
      "sampling/importance_sampling_ratio/mean": 0.8882616758346558,
      "sampling/importance_sampling_ratio/min": 0.14675666391849518,
      "sampling/sampling_logp_difference/max": 1.6794652938842773,
      "sampling/sampling_logp_difference/mean": 0.06773966550827026,
      "step": 347,
      "step_time": 10.328664960999504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.5438846126198769,
      "epoch": 0.00348,
      "grad_norm": 0.0006248099962249398,
      "kl": 1.3313154578208923,
      "learning_rate": 7.999963959455145e-06,
      "loss": -0.0001,
      "step": 348,
      "step_time": 6.798959563998778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8449730277061462,
      "epoch": 0.00349,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0066831219010055065,
      "kl": 1.3654840737581253,
      "learning_rate": 7.999963728056238e-06,
      "loss": -0.0,
      "num_tokens": 8270891.0,
      "reward": 2.012500047683716,
      "reward_std": 1.16224205493927,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.78125,
      "rewards/probe_shaping_dominance/std": 0.420013427734375,
      "rewards/probe_terminal_raw/mean": 0.21875,
      "rewards/probe_terminal_raw/std": 0.420013427734375,
      "rewards/rollout_reward_func/mean": -0.5625,
      "rewards/rollout_reward_func/std": 0.84002685546875,
      "sampling/importance_sampling_ratio/max": 1.4426002502441406,
      "sampling/importance_sampling_ratio/mean": 0.8822833299636841,
      "sampling/importance_sampling_ratio/min": 0.3499508798122406,
      "sampling/sampling_logp_difference/max": 1.0837316513061523,
      "sampling/sampling_logp_difference/mean": 0.0948411375284195,
      "step": 349,
      "step_time": 10.381334160999359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.846816748380661,
      "epoch": 0.0035,
      "grad_norm": 0.0034231310710310936,
      "kl": 1.6555369943380356,
      "learning_rate": 7.99996349591686e-06,
      "loss": -0.0,
      "step": 350,
      "step_time": 6.824553742999797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 9.0,
      "completions/max_terminated_length": 9.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.4471484422683716,
      "epoch": 0.00351,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00028458540327847004,
      "kl": 0.40467185433954,
      "learning_rate": 7.999963263037014e-06,
      "loss": 0.0,
      "num_tokens": 8316234.0,
      "reward": 1.4812500476837158,
      "reward_std": 1.6061248779296875,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 1.6061248779296875,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.0914900302886963,
      "sampling/importance_sampling_ratio/mean": 0.7615956664085388,
      "sampling/importance_sampling_ratio/min": 0.32088056206703186,
      "sampling/sampling_logp_difference/max": 0.9977098703384399,
      "sampling/sampling_logp_difference/mean": 0.14794917404651642,
      "step": 351,
      "step_time": 10.446612559999267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.4311248511075974,
      "epoch": 0.00352,
      "grad_norm": 0.0003620213537942618,
      "kl": 0.4597238264977932,
      "learning_rate": 7.999963029416695e-06,
      "loss": 0.0,
      "step": 352,
      "step_time": 6.943062976000419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.015625,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.015625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0510557070374489,
      "epoch": 0.00353,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0010256932582706213,
      "kl": 1.2628090418875217,
      "learning_rate": 7.999962795055906e-06,
      "loss": -0.0001,
      "num_tokens": 8362968.0,
      "reward": 2.137500047683716,
      "reward_std": 1.533233880996704,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.6720215082168579,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.65625,
      "rewards/probe_shaping_dominance/std": 0.4825586974620819,
      "rewards/probe_terminal_raw/mean": 0.34375,
      "rewards/probe_terminal_raw/std": 0.4825586974620819,
      "rewards/rollout_reward_func/mean": -0.3125,
      "rewards/rollout_reward_func/std": 0.9651173949241638,
      "sampling/importance_sampling_ratio/max": 1.4461380243301392,
      "sampling/importance_sampling_ratio/mean": 0.8395605087280273,
      "sampling/importance_sampling_ratio/min": 0.3192848265171051,
      "sampling/sampling_logp_difference/max": 1.0445311069488525,
      "sampling/sampling_logp_difference/mean": 0.13624471426010132,
      "step": 353,
      "step_time": 10.658984269000484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0730230584740639,
      "epoch": 0.00354,
      "grad_norm": 0.0011227383511140943,
      "kl": 1.2259647622704506,
      "learning_rate": 7.99996255995465e-06,
      "loss": -0.0001,
      "step": 354,
      "step_time": 6.383262848999493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0429575219750404,
      "epoch": 0.00355,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.014531441498548e-05,
      "kl": 0.059757897251984105,
      "learning_rate": 7.99996232411292e-06,
      "loss": 0.0,
      "num_tokens": 8408151.0,
      "reward": 2.168750047683716,
      "reward_std": 1.621118187904358,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.71875,
      "rewards/probe_completion_length/std": 1.2504031658172607,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 0.9531012773513794,
      "sampling/importance_sampling_ratio/mean": 0.827174961566925,
      "sampling/importance_sampling_ratio/min": 0.5229939222335815,
      "sampling/sampling_logp_difference/max": 0.48770320415496826,
      "sampling/sampling_logp_difference/mean": 0.0860244557261467,
      "step": 355,
      "step_time": 10.83670291099952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0519123449921608,
      "epoch": 0.00356,
      "grad_norm": 3.611388456192799e-05,
      "kl": 0.0671477972791763,
      "learning_rate": 7.999962087530722e-06,
      "loss": 0.0,
      "step": 356,
      "step_time": 6.801647416999458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.876899354159832,
      "epoch": 0.00357,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.189152241451666e-05,
      "kl": 0.5517611491377465,
      "learning_rate": 7.999961850208053e-06,
      "loss": 0.0,
      "num_tokens": 8453997.0,
      "reward": 2.575000047683716,
      "reward_std": 1.4312187433242798,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.1144719123840332,
      "sampling/importance_sampling_ratio/mean": 0.8704279661178589,
      "sampling/importance_sampling_ratio/min": 0.2846354842185974,
      "sampling/sampling_logp_difference/max": 0.721602737903595,
      "sampling/sampling_logp_difference/mean": 0.09897022694349289,
      "step": 357,
      "step_time": 10.110818112000743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8830019608139992,
      "epoch": 0.00358,
      "grad_norm": 4.22624361817725e-05,
      "kl": 0.5523047639580909,
      "learning_rate": 7.999961612144914e-06,
      "loss": 0.0,
      "step": 358,
      "step_time": 5.968289966999691
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9959534220397472,
      "epoch": 0.00359,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0002864841080736369,
      "kl": 0.8776353914290667,
      "learning_rate": 7.999961373341304e-06,
      "loss": 0.0,
      "num_tokens": 8503781.0,
      "reward": 2.137500047683716,
      "reward_std": 1.1482806205749512,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 2.968733310699463,
      "sampling/importance_sampling_ratio/mean": 0.8962721824645996,
      "sampling/importance_sampling_ratio/min": 0.31244829297065735,
      "sampling/sampling_logp_difference/max": 1.1493885517120361,
      "sampling/sampling_logp_difference/mean": 0.12804019451141357,
      "step": 359,
      "step_time": 10.358135880999725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0032146386802197,
      "epoch": 0.0036,
      "grad_norm": 0.0002744219091255218,
      "kl": 0.8691087402403355,
      "learning_rate": 7.999961133797226e-06,
      "loss": 0.0,
      "step": 360,
      "step_time": 5.9020768589998625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5623603612184525,
      "epoch": 0.00361,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 9.27875516936183e-05,
      "kl": 1.2790136635303497,
      "learning_rate": 7.999960893512676e-06,
      "loss": 0.0,
      "num_tokens": 8546449.0,
      "reward": 2.325000047683716,
      "reward_std": 1.008032202720642,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.3561346530914307,
      "sampling/importance_sampling_ratio/mean": 0.8485797643661499,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.5407872200012207,
      "sampling/sampling_logp_difference/mean": 0.09200026839971542,
      "step": 361,
      "step_time": 10.702817508000408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5627776719629765,
      "epoch": 0.00362,
      "grad_norm": 8.150070789270103e-05,
      "kl": 1.2784964963793755,
      "learning_rate": 7.999960652487659e-06,
      "loss": 0.0,
      "step": 362,
      "step_time": 6.0848739280008886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.968632735311985,
      "epoch": 0.00363,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008145206957124174,
      "kl": 0.6598832843301352,
      "learning_rate": 7.99996041072217e-06,
      "loss": 0.0,
      "num_tokens": 8592159.0,
      "reward": 1.9187500476837158,
      "reward_std": 1.256836175918579,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.374538540840149,
      "sampling/importance_sampling_ratio/mean": 0.8541547060012817,
      "sampling/importance_sampling_ratio/min": 0.17510434985160828,
      "sampling/sampling_logp_difference/max": 1.6452641487121582,
      "sampling/sampling_logp_difference/mean": 0.14628112316131592,
      "step": 363,
      "step_time": 10.549635234000107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9661797136068344,
      "epoch": 0.00364,
      "grad_norm": 0.0006351504125632346,
      "kl": 0.5736912706051953,
      "learning_rate": 7.999960168216212e-06,
      "loss": 0.0,
      "step": 364,
      "step_time": 6.1295828570000594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9562252089381218,
      "epoch": 0.00365,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.001311680767685175,
      "kl": 1.6579833626747131,
      "learning_rate": 7.999959924969784e-06,
      "loss": -0.0,
      "num_tokens": 8642179.0,
      "reward": 2.043750047683716,
      "reward_std": 1.2790968418121338,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.71875,
      "rewards/probe_shaping_dominance/std": 0.45680341124534607,
      "rewards/probe_terminal_raw/mean": 0.28125,
      "rewards/probe_terminal_raw/std": 0.45680341124534607,
      "rewards/rollout_reward_func/mean": -0.4375,
      "rewards/rollout_reward_func/std": 0.9136068224906921,
      "sampling/importance_sampling_ratio/max": 1.3930572271347046,
      "sampling/importance_sampling_ratio/mean": 0.7577755451202393,
      "sampling/importance_sampling_ratio/min": 0.16704365611076355,
      "sampling/sampling_logp_difference/max": 1.0883684158325195,
      "sampling/sampling_logp_difference/mean": 0.18352536857128143,
      "step": 365,
      "step_time": 11.436037391999434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9706911072134972,
      "epoch": 0.00366,
      "grad_norm": 0.0013973768800497055,
      "kl": 1.7851417511701584,
      "learning_rate": 7.999959680982886e-06,
      "loss": -0.0,
      "step": 366,
      "step_time": 6.348716901999978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.535181537270546,
      "epoch": 0.00367,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.001643925323151052,
      "kl": 0.6322375678769276,
      "learning_rate": 7.99995943625552e-06,
      "loss": -0.0001,
      "num_tokens": 8685529.0,
      "reward": 3.293750047683716,
      "reward_std": 1.0035220384597778,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.3125,
      "rewards/probe_shaping_dominance/std": 0.4709290862083435,
      "rewards/probe_terminal_raw/mean": 0.6875,
      "rewards/probe_terminal_raw/std": 0.4709290862083435,
      "rewards/rollout_reward_func/mean": 0.375,
      "rewards/rollout_reward_func/std": 0.941858172416687,
      "sampling/importance_sampling_ratio/max": 1.264983892440796,
      "sampling/importance_sampling_ratio/mean": 0.8649332523345947,
      "sampling/importance_sampling_ratio/min": 0.20083816349506378,
      "sampling/sampling_logp_difference/max": 1.3320589065551758,
      "sampling/sampling_logp_difference/mean": 0.08798055350780487,
      "step": 367,
      "step_time": 10.462185017000138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5350116603076458,
      "epoch": 0.00368,
      "grad_norm": 0.0019252891652286053,
      "kl": 0.6243522961935923,
      "learning_rate": 7.999959190787684e-06,
      "loss": -0.0001,
      "step": 368,
      "step_time": 5.9952414070003215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0593371242284775,
      "epoch": 0.00369,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0002375837357249111,
      "kl": 0.7576817069202662,
      "learning_rate": 7.999958944579377e-06,
      "loss": 0.0,
      "num_tokens": 8733838.0,
      "reward": 2.106250047683716,
      "reward_std": 1.1670026779174805,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 0.9666203856468201,
      "sampling/importance_sampling_ratio/mean": 0.7411911487579346,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 0.8964768052101135,
      "sampling/sampling_logp_difference/mean": 0.13502906262874603,
      "step": 369,
      "step_time": 10.525359173999732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0495215579867363,
      "epoch": 0.0037,
      "grad_norm": 0.0002475433284416795,
      "kl": 0.7588968193158507,
      "learning_rate": 7.999958697630603e-06,
      "loss": 0.0,
      "step": 370,
      "step_time": 6.018097386999671
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9190730340778828,
      "epoch": 0.00371,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.001725654467009008,
      "kl": 1.4933142140507698,
      "learning_rate": 7.999958449941359e-06,
      "loss": -0.0001,
      "num_tokens": 8782213.0,
      "reward": 2.356250047683716,
      "reward_std": 1.4560080766677856,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.5625,
      "rewards/probe_shaping_dominance/std": 0.504016101360321,
      "rewards/probe_terminal_raw/mean": 0.4375,
      "rewards/probe_terminal_raw/std": 0.504016101360321,
      "rewards/rollout_reward_func/mean": -0.125,
      "rewards/rollout_reward_func/std": 1.008032202720642,
      "sampling/importance_sampling_ratio/max": 2.0596885681152344,
      "sampling/importance_sampling_ratio/mean": 0.850114107131958,
      "sampling/importance_sampling_ratio/min": 0.1749039888381958,
      "sampling/sampling_logp_difference/max": 2.175316333770752,
      "sampling/sampling_logp_difference/mean": 0.1391027569770813,
      "step": 371,
      "step_time": 10.462862956000663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9161196202039719,
      "epoch": 0.00372,
      "grad_norm": 0.0019953493028879166,
      "kl": 1.440907794982195,
      "learning_rate": 7.999958201511645e-06,
      "loss": -0.0001,
      "step": 372,
      "step_time": 6.057245024000167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.30112432315945625,
      "epoch": 0.00373,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.154292128077941e-07,
      "kl": 0.9947994146496058,
      "learning_rate": 7.999957952341462e-06,
      "loss": 0.0,
      "num_tokens": 8826771.0,
      "reward": 3.950000047683716,
      "reward_std": 0.0,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.0,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 1.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": 1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 0.9597049951553345,
      "sampling/importance_sampling_ratio/mean": 0.9405255317687988,
      "sampling/importance_sampling_ratio/min": 0.9263921976089478,
      "sampling/sampling_logp_difference/max": 0.07755201309919357,
      "sampling/sampling_logp_difference/mean": 0.020689481869339943,
      "step": 373,
      "step_time": 9.649506622000445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.2976001650094986,
      "epoch": 0.00374,
      "grad_norm": 3.7091297144797863e-07,
      "kl": 0.9948522942140698,
      "learning_rate": 7.99995770243081e-06,
      "loss": 0.0,
      "step": 374,
      "step_time": 5.476371095999639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.179700344800949,
      "epoch": 0.00375,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00019490659178700298,
      "kl": 0.7850398123264313,
      "learning_rate": 7.999957451779688e-06,
      "loss": 0.0,
      "num_tokens": 8874930.0,
      "reward": 2.043750047683716,
      "reward_std": 1.201058030128479,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.468320369720459,
      "sampling/importance_sampling_ratio/mean": 0.819076657295227,
      "sampling/importance_sampling_ratio/min": 0.35633477568626404,
      "sampling/sampling_logp_difference/max": 0.8690546751022339,
      "sampling/sampling_logp_difference/mean": 0.14318270981311798,
      "step": 375,
      "step_time": 10.502751606999482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1702759563922882,
      "epoch": 0.00376,
      "grad_norm": 0.00013393217523116618,
      "kl": 0.81276436150074,
      "learning_rate": 7.9999572003881e-06,
      "loss": 0.0,
      "step": 376,
      "step_time": 6.032545493999351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7177925407886505,
      "epoch": 0.00377,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.9635516916168854e-05,
      "kl": 1.2584875486791134,
      "learning_rate": 7.99995694825604e-06,
      "loss": 0.0,
      "num_tokens": 8918234.0,
      "reward": 2.606250047683716,
      "reward_std": 1.4052752256393433,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.3882752656936646,
      "sampling/importance_sampling_ratio/mean": 0.9102308750152588,
      "sampling/importance_sampling_ratio/min": 0.6210145354270935,
      "sampling/sampling_logp_difference/max": 0.4975299835205078,
      "sampling/sampling_logp_difference/mean": 0.060513950884342194,
      "step": 377,
      "step_time": 10.323000849000437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7326309494674206,
      "epoch": 0.00378,
      "grad_norm": 4.1395982407266274e-05,
      "kl": 1.2528069280087948,
      "learning_rate": 7.999956695383513e-06,
      "loss": 0.0,
      "step": 378,
      "step_time": 5.978144651000093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.4752974957227707,
      "epoch": 0.00379,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 7.717848347965628e-05,
      "kl": 0.4324369365349412,
      "learning_rate": 7.999956441770516e-06,
      "loss": 0.0,
      "num_tokens": 8965080.0,
      "reward": 1.5750000476837158,
      "reward_std": 1.338029384613037,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 1.2916449308395386,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.9374521970748901,
      "sampling/importance_sampling_ratio/mean": 0.8539052605628967,
      "sampling/importance_sampling_ratio/min": 0.258626252412796,
      "sampling/sampling_logp_difference/max": 0.9988803863525391,
      "sampling/sampling_logp_difference/mean": 0.19715960323810577,
      "step": 379,
      "step_time": 10.432353575000889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.4704746305942535,
      "epoch": 0.0038,
      "grad_norm": 0.00011949010513490066,
      "kl": 0.44164562225341797,
      "learning_rate": 7.999956187417052e-06,
      "loss": 0.0,
      "step": 380,
      "step_time": 6.4899310639998475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.111838586628437,
      "epoch": 0.00381,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00014719787577632815,
      "kl": 0.6005670647136867,
      "learning_rate": 7.999955932323117e-06,
      "loss": 0.0,
      "num_tokens": 9015299.0,
      "reward": 1.9812500476837158,
      "reward_std": 1.2309025526046753,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.1842180490493774,
      "sampling/importance_sampling_ratio/mean": 0.839226484298706,
      "sampling/importance_sampling_ratio/min": 0.42126908898353577,
      "sampling/sampling_logp_difference/max": 0.5560178756713867,
      "sampling/sampling_logp_difference/mean": 0.10737180709838867,
      "step": 381,
      "step_time": 9.803125550000004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1229261308908463,
      "epoch": 0.00382,
      "grad_norm": 0.0001319620496360585,
      "kl": 0.5978384953923523,
      "learning_rate": 7.999955676488715e-06,
      "loss": 0.0,
      "step": 382,
      "step_time": 5.941128261000358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.6875,
      "completions/mean_terminated_length": 2.6875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0769974440336227,
      "epoch": 0.00383,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00012360778055153787,
      "kl": 0.4884014050476253,
      "learning_rate": 7.999955419913844e-06,
      "loss": 0.0,
      "num_tokens": 9061810.0,
      "reward": 2.137500047683716,
      "reward_std": 1.1482806205749512,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.6875,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 0.9834169745445251,
      "sampling/importance_sampling_ratio/mean": 0.7719845771789551,
      "sampling/importance_sampling_ratio/min": 0.2768201529979706,
      "sampling/sampling_logp_difference/max": 1.0129671096801758,
      "sampling/sampling_logp_difference/mean": 0.1313181072473526,
      "step": 383,
      "step_time": 10.252918843000316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.08458661288023,
      "epoch": 0.00384,
      "grad_norm": 0.00014907358854543418,
      "kl": 0.5057441159151495,
      "learning_rate": 7.999955162598504e-06,
      "loss": 0.0,
      "step": 384,
      "step_time": 7.167399140999805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.542071346193552,
      "epoch": 0.00385,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 3.077173460042104e-05,
      "kl": 1.9730855226516724,
      "learning_rate": 7.999954904542697e-06,
      "loss": 0.0,
      "num_tokens": 9106137.0,
      "reward": 3.293750047683716,
      "reward_std": 1.180742621421814,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.25,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.75,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": 0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.0693261623382568,
      "sampling/importance_sampling_ratio/mean": 0.9118248820304871,
      "sampling/importance_sampling_ratio/min": 0.41071459650993347,
      "sampling/sampling_logp_difference/max": 0.7213708162307739,
      "sampling/sampling_logp_difference/mean": 0.04966055974364281,
      "step": 385,
      "step_time": 9.818418909998854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5439686328172684,
      "epoch": 0.00386,
      "grad_norm": 2.3318221792578697e-05,
      "kl": 1.966349296271801,
      "learning_rate": 7.999954645746422e-06,
      "loss": 0.0,
      "step": 386,
      "step_time": 5.921195296999485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4.0,
      "completions/max_terminated_length": 4.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.3177487701177597,
      "epoch": 0.00387,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00017945896252058446,
      "kl": 0.47732366155833006,
      "learning_rate": 7.999954386209677e-06,
      "loss": 0.0,
      "num_tokens": 9154765.0,
      "reward": 1.4500000476837158,
      "reward_std": 0.6720215082168579,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 0.5670737028121948,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.1261528730392456,
      "sampling/importance_sampling_ratio/mean": 0.7251231670379639,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.286957859992981,
      "sampling/sampling_logp_difference/mean": 0.18756341934204102,
      "step": 387,
      "step_time": 11.11347648400033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.313660904765129,
      "epoch": 0.00388,
      "grad_norm": 0.00017108296742662787,
      "kl": 0.4758234744949732,
      "learning_rate": 7.999954125932465e-06,
      "loss": 0.0,
      "step": 388,
      "step_time": 6.096910730000218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7469111457467079,
      "epoch": 0.00389,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 2.495557964721229e-05,
      "kl": 0.037764198847071384,
      "learning_rate": 7.999953864914783e-06,
      "loss": 0.0,
      "num_tokens": 9194799.0,
      "reward": 2.481250047683716,
      "reward_std": 1.5023503303527832,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 0.9957536458969116,
      "sampling/importance_sampling_ratio/mean": 0.8884516358375549,
      "sampling/importance_sampling_ratio/min": 0.6219885349273682,
      "sampling/sampling_logp_difference/max": 0.3152664601802826,
      "sampling/sampling_logp_difference/mean": 0.054477110505104065,
      "step": 389,
      "step_time": 9.634475558999384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.749498151242733,
      "epoch": 0.0039,
      "grad_norm": 2.757420043053571e-05,
      "kl": 0.037885259749600664,
      "learning_rate": 7.999953603156633e-06,
      "loss": 0.0,
      "step": 390,
      "step_time": 5.825650949000192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 9.0,
      "completions/max_terminated_length": 9.0,
      "completions/mean_length": 2.6875,
      "completions/mean_terminated_length": 2.6875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2955705225467682,
      "epoch": 0.00391,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00012024679745081812,
      "kl": 0.3169508697465062,
      "learning_rate": 7.999953340658018e-06,
      "loss": 0.0,
      "num_tokens": 9237578.0,
      "reward": 2.1999998092651367,
      "reward_std": 1.8837162256240845,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.75,
      "rewards/probe_completion_length/std": 1.5862311124801636,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.169108271598816,
      "sampling/importance_sampling_ratio/mean": 0.8330661058425903,
      "sampling/importance_sampling_ratio/min": 0.5545997023582458,
      "sampling/sampling_logp_difference/max": 0.614017903804779,
      "sampling/sampling_logp_difference/mean": 0.11316787451505661,
      "step": 391,
      "step_time": 10.490885173000152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.303221918642521,
      "epoch": 0.00392,
      "grad_norm": 0.00011566941975615919,
      "kl": 0.31426724046468735,
      "learning_rate": 7.999953077418933e-06,
      "loss": 0.0,
      "step": 392,
      "step_time": 6.56834959400021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.375,
      "completions/mean_terminated_length": 2.375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.460183471441269,
      "epoch": 0.00393,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00010010498954216018,
      "kl": 0.25794385001063347,
      "learning_rate": 7.99995281343938e-06,
      "loss": 0.0,
      "num_tokens": 9290253.0,
      "reward": 1.3250000476837158,
      "reward_std": 0.49186939001083374,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.375,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.4612799882888794,
      "sampling/importance_sampling_ratio/mean": 0.8141236305236816,
      "sampling/importance_sampling_ratio/min": 0.5086011290550232,
      "sampling/sampling_logp_difference/max": 0.46626272797584534,
      "sampling/sampling_logp_difference/mean": 0.14415019750595093,
      "step": 393,
      "step_time": 10.023167942000327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.4618552178144455,
      "epoch": 0.00394,
      "grad_norm": 9.292273171013221e-05,
      "kl": 0.24823577469214797,
      "learning_rate": 7.99995254871936e-06,
      "loss": 0.0,
      "step": 394,
      "step_time": 6.091329966000103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5105515196919441,
      "epoch": 0.00395,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.006318738218396902,
      "kl": 1.4926518842112273,
      "learning_rate": 7.999952283258871e-06,
      "loss": -0.0,
      "num_tokens": 9339783.0,
      "reward": 3.325000047683716,
      "reward_std": 1.0701221227645874,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.28125,
      "rewards/probe_shaping_dominance/std": 0.45680341124534607,
      "rewards/probe_terminal_raw/mean": 0.71875,
      "rewards/probe_terminal_raw/std": 0.45680341124534607,
      "rewards/rollout_reward_func/mean": 0.4375,
      "rewards/rollout_reward_func/std": 0.9136068224906921,
      "sampling/importance_sampling_ratio/max": 1.946527123451233,
      "sampling/importance_sampling_ratio/mean": 0.998159646987915,
      "sampling/importance_sampling_ratio/min": 0.5635565519332886,
      "sampling/sampling_logp_difference/max": 0.6543045043945312,
      "sampling/sampling_logp_difference/mean": 0.07140873372554779,
      "step": 395,
      "step_time": 10.683865035999588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.5092885829508305,
      "epoch": 0.00396,
      "grad_norm": 0.0005902937264181674,
      "kl": 1.4889302601804957,
      "learning_rate": 7.999952017057914e-06,
      "loss": -0.0001,
      "step": 396,
      "step_time": 6.604787075999866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8198611810803413,
      "epoch": 0.00397,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.475277041317895e-05,
      "kl": 0.7255684435367584,
      "learning_rate": 7.99995175011649e-06,
      "loss": 0.0,
      "num_tokens": 9383200.0,
      "reward": 2.7312498092651367,
      "reward_std": 1.73641037940979,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 1.2374368906021118,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.1218085289001465,
      "sampling/importance_sampling_ratio/mean": 0.8827304840087891,
      "sampling/importance_sampling_ratio/min": 0.5214709639549255,
      "sampling/sampling_logp_difference/max": 0.4110407829284668,
      "sampling/sampling_logp_difference/mean": 0.07405085116624832,
      "step": 397,
      "step_time": 9.918148451999514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8066371604800224,
      "epoch": 0.00398,
      "grad_norm": 6.86766070430167e-05,
      "kl": 0.7211511321365833,
      "learning_rate": 7.9999514824346e-06,
      "loss": 0.0,
      "step": 398,
      "step_time": 6.492818946999705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6338056772947311,
      "epoch": 0.00399,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0021156487055122852,
      "kl": 2.406707100570202,
      "learning_rate": 7.999951214012241e-06,
      "loss": -0.0001,
      "num_tokens": 9432820.0,
      "reward": 2.481250047683716,
      "reward_std": 1.2947629690170288,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 0.59375,
      "rewards/probe_shaping_dominance/std": 0.49899089336395264,
      "rewards/probe_terminal_raw/mean": 0.40625,
      "rewards/probe_terminal_raw/std": 0.49899089336395264,
      "rewards/rollout_reward_func/mean": -0.1875,
      "rewards/rollout_reward_func/std": 0.9979817867279053,
      "sampling/importance_sampling_ratio/max": 2.8168559074401855,
      "sampling/importance_sampling_ratio/mean": 0.9036719799041748,
      "sampling/importance_sampling_ratio/min": 0.24805206060409546,
      "sampling/sampling_logp_difference/max": 1.526820421218872,
      "sampling/sampling_logp_difference/mean": 0.1296796202659607,
      "step": 399,
      "step_time": 10.381847932000255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.026041666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.026041666977107525,
      "entropy": 0.6298884674906731,
      "epoch": 0.004,
      "grad_norm": 0.003479708917438984,
      "kl": 2.6866486370563507,
      "learning_rate": 7.999950944849416e-06,
      "loss": -0.0001,
      "step": 400,
      "step_time": 6.760044167000615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0361065343022346,
      "epoch": 0.00401,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.91031605633907e-05,
      "kl": 0.47253769740927964,
      "learning_rate": 7.999950674946121e-06,
      "loss": 0.0,
      "num_tokens": 9478722.0,
      "reward": 2.4187498092651367,
      "reward_std": 1.9997479915618896,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 1.7867680788040161,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.11195707321167,
      "sampling/importance_sampling_ratio/mean": 0.844825029373169,
      "sampling/importance_sampling_ratio/min": 0.6442330479621887,
      "sampling/sampling_logp_difference/max": 0.2771162986755371,
      "sampling/sampling_logp_difference/mean": 0.07445387542247772,
      "step": 401,
      "step_time": 10.186995967000257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.030918937176466,
      "epoch": 0.00402,
      "grad_norm": 3.9547725464217365e-05,
      "kl": 0.46755533292889595,
      "learning_rate": 7.99995040430236e-06,
      "loss": 0.0,
      "step": 402,
      "step_time": 6.485445361999609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2354515939950943,
      "epoch": 0.00403,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 7.260632264660671e-05,
      "kl": 0.7972908793017268,
      "learning_rate": 7.999950132918132e-06,
      "loss": 0.0,
      "num_tokens": 9529957.0,
      "reward": 1.9812500476837158,
      "reward_std": 1.2309025526046753,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.3521214723587036,
      "sampling/importance_sampling_ratio/mean": 0.8476842045783997,
      "sampling/importance_sampling_ratio/min": 0.43482327461242676,
      "sampling/sampling_logp_difference/max": 0.671419620513916,
      "sampling/sampling_logp_difference/mean": 0.12039721012115479,
      "step": 403,
      "step_time": 10.79022957999996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.2416410520672798,
      "epoch": 0.00404,
      "grad_norm": 6.640212814090773e-05,
      "kl": 0.8009253116324544,
      "learning_rate": 7.999949860793436e-06,
      "loss": 0.0,
      "step": 404,
      "step_time": 6.121063306999531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.40625,
      "completions/mean_terminated_length": 2.40625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.044917955994606,
      "epoch": 0.00405,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.647417881642468e-05,
      "kl": 0.5192130440846086,
      "learning_rate": 7.999949587928276e-06,
      "loss": 0.0,
      "num_tokens": 9577996.0,
      "reward": 1.8562500476837158,
      "reward_std": 1.2790968418121338,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.40625,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.008124828338623,
      "sampling/importance_sampling_ratio/mean": 0.868937075138092,
      "sampling/importance_sampling_ratio/min": 0.5071920156478882,
      "sampling/sampling_logp_difference/max": 0.4627190828323364,
      "sampling/sampling_logp_difference/mean": 0.07959282398223877,
      "step": 405,
      "step_time": 10.07913560499992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.054791770875454,
      "epoch": 0.00406,
      "grad_norm": 4.776316927745938e-05,
      "kl": 0.5234422187786549,
      "learning_rate": 7.999949314322646e-06,
      "loss": 0.0,
      "step": 406,
      "step_time": 6.5588358670011075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.40625,
      "completions/mean_terminated_length": 2.40625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2371145486831665,
      "epoch": 0.00407,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00010027150710811839,
      "kl": 0.8442691564559937,
      "learning_rate": 7.999949039976548e-06,
      "loss": 0.0,
      "num_tokens": 9626057.0,
      "reward": 1.3562500476837158,
      "reward_std": 0.498990923166275,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.40625,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.8078845739364624,
      "sampling/importance_sampling_ratio/mean": 0.8824479579925537,
      "sampling/importance_sampling_ratio/min": 0.4061150550842285,
      "sampling/sampling_logp_difference/max": 0.8688634037971497,
      "sampling/sampling_logp_difference/mean": 0.1303093582391739,
      "step": 407,
      "step_time": 9.787462137999682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.231427051126957,
      "epoch": 0.00408,
      "grad_norm": 8.325048111146316e-05,
      "kl": 0.8341128025203943,
      "learning_rate": 7.999948764889987e-06,
      "loss": 0.0,
      "step": 408,
      "step_time": 6.426104604999637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5369663275778294,
      "epoch": 0.00409,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 2.07056673389161e-05,
      "kl": 1.0741528943181038,
      "learning_rate": 7.999948489062955e-06,
      "loss": 0.0,
      "num_tokens": 9666963.0,
      "reward": 2.700000047683716,
      "reward_std": 1.319823980331421,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.75,
      "rewards/probe_completion_length/std": 0.4399413466453552,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.2189736366271973,
      "sampling/importance_sampling_ratio/mean": 0.9475693702697754,
      "sampling/importance_sampling_ratio/min": 0.6639173030853271,
      "sampling/sampling_logp_difference/max": 0.38153910636901855,
      "sampling/sampling_logp_difference/mean": 0.044145338237285614,
      "step": 409,
      "step_time": 9.87107176499967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5364666283130646,
      "epoch": 0.0041,
      "grad_norm": 2.00495524040889e-05,
      "kl": 1.0751704201102257,
      "learning_rate": 7.99994821249546e-06,
      "loss": 0.0,
      "step": 410,
      "step_time": 6.38307224800019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.375,
      "completions/mean_terminated_length": 2.375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.059143915772438,
      "epoch": 0.00411,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.802541297976859e-05,
      "kl": 0.8314541056752205,
      "learning_rate": 7.999947935187496e-06,
      "loss": 0.0,
      "num_tokens": 9717854.0,
      "reward": 1.8250000476837158,
      "reward_std": 1.2889105081558228,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.375,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.2114936113357544,
      "sampling/importance_sampling_ratio/mean": 0.8912780284881592,
      "sampling/importance_sampling_ratio/min": 0.5582963824272156,
      "sampling/sampling_logp_difference/max": 0.5062055587768555,
      "sampling/sampling_logp_difference/mean": 0.10437913984060287,
      "step": 411,
      "step_time": 9.990031543999521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0532640889286995,
      "epoch": 0.00412,
      "grad_norm": 4.7814017307246104e-05,
      "kl": 0.8273545559495687,
      "learning_rate": 7.999947657139067e-06,
      "loss": 0.0,
      "step": 412,
      "step_time": 6.57262487800017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.46875,
      "completions/mean_terminated_length": 2.46875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2248899713158607,
      "epoch": 0.00413,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.309987657004967e-05,
      "kl": 0.09391478716861457,
      "learning_rate": 7.99994737835017e-06,
      "loss": 0.0,
      "num_tokens": 9764095.0,
      "reward": 1.9187500476837158,
      "reward_std": 1.256836175918579,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.1762675046920776,
      "sampling/importance_sampling_ratio/mean": 0.8028596639633179,
      "sampling/importance_sampling_ratio/min": 0.4906952977180481,
      "sampling/sampling_logp_difference/max": 0.42553746700286865,
      "sampling/sampling_logp_difference/mean": 0.11625593155622482,
      "step": 413,
      "step_time": 9.962155038000674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.2161956802010536,
      "epoch": 0.00414,
      "grad_norm": 5.9610069001792e-05,
      "kl": 0.09792823957832297,
      "learning_rate": 7.999947098820806e-06,
      "loss": 0.0,
      "step": 414,
      "step_time": 6.5541509539998515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5474282167851925,
      "epoch": 0.00415,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 7.157572690630332e-05,
      "kl": 1.5889837145805359,
      "learning_rate": 7.999946818550977e-06,
      "loss": 0.0,
      "num_tokens": 9810805.0,
      "reward": 2.262500047683716,
      "reward_std": 1.0606601238250732,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.550018310546875,
      "sampling/importance_sampling_ratio/mean": 0.966625452041626,
      "sampling/importance_sampling_ratio/min": 0.3172820806503296,
      "sampling/sampling_logp_difference/max": 1.058587670326233,
      "sampling/sampling_logp_difference/mean": 0.06538467109203339,
      "step": 415,
      "step_time": 10.29786402399941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5420740880072117,
      "epoch": 0.00416,
      "grad_norm": 7.944254321046174e-05,
      "kl": 1.5934259369969368,
      "learning_rate": 7.99994653754068e-06,
      "loss": 0.0,
      "step": 416,
      "step_time": 5.919829822999418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.34375,
      "completions/mean_terminated_length": 2.34375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.3573927953839302,
      "epoch": 0.00417,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.489736213348806e-05,
      "kl": 0.4752029323717579,
      "learning_rate": 7.999946255789918e-06,
      "loss": 0.0,
      "num_tokens": 9858587.0,
      "reward": 1.2937500476837158,
      "reward_std": 0.4825587272644043,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.34375,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.2800050973892212,
      "sampling/importance_sampling_ratio/mean": 0.776732325553894,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.6876821517944336,
      "sampling/sampling_logp_difference/mean": 0.16653087735176086,
      "step": 417,
      "step_time": 10.134290047998547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.338614284992218,
      "epoch": 0.00418,
      "grad_norm": 8.523750875610858e-05,
      "kl": 0.48709472338669,
      "learning_rate": 7.99994597329869e-06,
      "loss": 0.0,
      "step": 418,
      "step_time": 6.640404371999466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8219548910856247,
      "epoch": 0.00419,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 2.453891465847846e-05,
      "kl": 0.6859103422611952,
      "learning_rate": 7.999945690066996e-06,
      "loss": 0.0,
      "num_tokens": 9903235.0,
      "reward": 2.543750047683716,
      "reward_std": 1.4560080766677856,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 0.9690582752227783,
      "sampling/importance_sampling_ratio/mean": 0.8696466088294983,
      "sampling/importance_sampling_ratio/min": 0.528867781162262,
      "sampling/sampling_logp_difference/max": 0.45124274492263794,
      "sampling/sampling_logp_difference/mean": 0.07117698341608047,
      "step": 419,
      "step_time": 9.838608534000741
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8214314952492714,
      "epoch": 0.0042,
      "grad_norm": 3.332663618493825e-05,
      "kl": 0.6894950913265347,
      "learning_rate": 7.999945406094835e-06,
      "loss": 0.0,
      "step": 420,
      "step_time": 6.389929472001313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8140515983104706,
      "epoch": 0.00421,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 1.6370302546420135e-05,
      "kl": 0.49950144518516026,
      "learning_rate": 7.999945121382207e-06,
      "loss": 0.0,
      "num_tokens": 9951581.0,
      "reward": 2.606250047683716,
      "reward_std": 1.4052752256393433,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.0606497526168823,
      "sampling/importance_sampling_ratio/mean": 0.8690915107727051,
      "sampling/importance_sampling_ratio/min": 0.5077239274978638,
      "sampling/sampling_logp_difference/max": 0.46195554733276367,
      "sampling/sampling_logp_difference/mean": 0.07101378589868546,
      "step": 421,
      "step_time": 10.472032985000624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7896543815732002,
      "epoch": 0.00422,
      "grad_norm": 9.226971997122746e-06,
      "kl": 0.49870440473023336,
      "learning_rate": 7.999944835929116e-06,
      "loss": 0.0,
      "step": 422,
      "step_time": 6.057825643999422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.1121821850538254,
      "epoch": 0.00423,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004775114357471466,
      "kl": 1.3636024817824364,
      "learning_rate": 7.999944549735557e-06,
      "loss": 0.0,
      "num_tokens": 10000036.0,
      "reward": 1.9812500476837158,
      "reward_std": 1.2309025526046753,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.3543111085891724,
      "sampling/importance_sampling_ratio/mean": 0.8356016874313354,
      "sampling/importance_sampling_ratio/min": 0.18780741095542908,
      "sampling/sampling_logp_difference/max": 1.3883202075958252,
      "sampling/sampling_logp_difference/mean": 0.15372709929943085,
      "step": 423,
      "step_time": 9.897093682999184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.121554672718048,
      "epoch": 0.00424,
      "grad_norm": 0.00040099461330100894,
      "kl": 1.33827693015337,
      "learning_rate": 7.999944262801533e-06,
      "loss": 0.0,
      "step": 424,
      "step_time": 6.476370014000167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.602243434637785,
      "epoch": 0.00425,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.09244782733731e-05,
      "kl": 1.7651392817497253,
      "learning_rate": 7.999943975127043e-06,
      "loss": 0.0,
      "num_tokens": 10046052.0,
      "reward": 2.731250047683716,
      "reward_std": 1.2885193824768066,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.75,
      "rewards/probe_completion_length/std": 0.4399413466453552,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.8853356838226318,
      "sampling/importance_sampling_ratio/mean": 0.9488242268562317,
      "sampling/importance_sampling_ratio/min": 0.2765178978443146,
      "sampling/sampling_logp_difference/max": 1.041520118713379,
      "sampling/sampling_logp_difference/mean": 0.07927368581295013,
      "step": 425,
      "step_time": 10.415064779000204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6055986508727074,
      "epoch": 0.00426,
      "grad_norm": 3.977708183811046e-05,
      "kl": 1.755710482597351,
      "learning_rate": 7.999943686712088e-06,
      "loss": 0.0,
      "step": 426,
      "step_time": 5.944202355000925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.40625,
      "completions/mean_terminated_length": 2.40625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.4355954229831696,
      "epoch": 0.00427,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.000142176344525069,
      "kl": 0.18295252230018377,
      "learning_rate": 7.999943397556666e-06,
      "loss": 0.0,
      "num_tokens": 10090652.0,
      "reward": 1.4187500476837158,
      "reward_std": 1.43649160861969,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.46875,
      "rewards/probe_completion_length/std": 1.43649160861969,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.3885198831558228,
      "sampling/importance_sampling_ratio/mean": 0.830295979976654,
      "sampling/importance_sampling_ratio/min": 0.39266979694366455,
      "sampling/sampling_logp_difference/max": 0.7006630897521973,
      "sampling/sampling_logp_difference/mean": 0.13479085266590118,
      "step": 427,
      "step_time": 9.895013823001136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.435707375407219,
      "epoch": 0.00428,
      "grad_norm": 0.00015517303836531937,
      "kl": 0.18490907363593578,
      "learning_rate": 7.99994310766078e-06,
      "loss": 0.0,
      "step": 428,
      "step_time": 6.982020691000798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.71875,
      "completions/mean_terminated_length": 2.71875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.1079173982143402,
      "epoch": 0.00429,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.5960969146108255e-05,
      "kl": 0.755518133752048,
      "learning_rate": 7.999942817024428e-06,
      "loss": 0.0,
      "num_tokens": 10137753.0,
      "reward": 2.231250047683716,
      "reward_std": 1.727096676826477,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 1.4081416130065918,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.081230640411377,
      "sampling/importance_sampling_ratio/mean": 0.8246882557868958,
      "sampling/importance_sampling_ratio/min": 0.5850779414176941,
      "sampling/sampling_logp_difference/max": 0.6071332693099976,
      "sampling/sampling_logp_difference/mean": 0.08669533580541611,
      "step": 429,
      "step_time": 9.993113438999899
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1078582182526588,
      "epoch": 0.0043,
      "grad_norm": 7.557954086223617e-05,
      "kl": 0.7598963440395892,
      "learning_rate": 7.99994252564761e-06,
      "loss": 0.0,
      "step": 430,
      "step_time": 6.045752523999909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 3.09375,
      "completions/mean_terminated_length": 3.09375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.42129435390233994,
      "epoch": 0.00431,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00011003066174453124,
      "kl": 0.6489775236696005,
      "learning_rate": 7.999942233530327e-06,
      "loss": 0.0,
      "num_tokens": 10175885.0,
      "reward": 3.5749998092651367,
      "reward_std": 1.2636353969573975,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.125,
      "rewards/probe_completion_length/std": 1.099853277206421,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.25,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.75,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": 0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.341364860534668,
      "sampling/importance_sampling_ratio/mean": 0.9086952209472656,
      "sampling/importance_sampling_ratio/min": 0.49460649490356445,
      "sampling/sampling_logp_difference/max": 0.5881896615028381,
      "sampling/sampling_logp_difference/mean": 0.047025080770254135,
      "step": 431,
      "step_time": 9.545812129002115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.4164751283824444,
      "epoch": 0.00432,
      "grad_norm": 0.000159382849233225,
      "kl": 0.6406664741225541,
      "learning_rate": 7.999941940672578e-06,
      "loss": 0.0,
      "step": 432,
      "step_time": 6.606084928999735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8442269414663315,
      "epoch": 0.00433,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 3.960997491958551e-05,
      "kl": 0.9814293906092644,
      "learning_rate": 7.999941647074366e-06,
      "loss": 0.0,
      "num_tokens": 10219910.0,
      "reward": 2.543750047683716,
      "reward_std": 1.4560080766677856,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 0.9716951847076416,
      "sampling/importance_sampling_ratio/mean": 0.8736358284950256,
      "sampling/importance_sampling_ratio/min": 0.4931846559047699,
      "sampling/sampling_logp_difference/max": 0.4490457773208618,
      "sampling/sampling_logp_difference/mean": 0.06978647410869598,
      "step": 433,
      "step_time": 9.839958128001854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8495101407170296,
      "epoch": 0.00434,
      "grad_norm": 3.293993358965963e-05,
      "kl": 0.9773873090744019,
      "learning_rate": 7.999941352735688e-06,
      "loss": 0.0,
      "step": 434,
      "step_time": 5.947311551000894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.577320072799921,
      "epoch": 0.00435,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.008683150634169579,
      "kl": 5.158851059153676,
      "learning_rate": 7.999941057656543e-06,
      "loss": -0.0,
      "num_tokens": 10264421.0,
      "reward": 3.075000047683716,
      "reward_std": 1.263635277748108,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.34375,
      "rewards/probe_shaping_dominance/std": 0.4825586974620819,
      "rewards/probe_terminal_raw/mean": 0.65625,
      "rewards/probe_terminal_raw/std": 0.4825586974620819,
      "rewards/rollout_reward_func/mean": 0.3125,
      "rewards/rollout_reward_func/std": 0.9651173949241638,
      "sampling/importance_sampling_ratio/max": 1.0832629203796387,
      "sampling/importance_sampling_ratio/mean": 0.8609358072280884,
      "sampling/importance_sampling_ratio/min": 0.0714401826262474,
      "sampling/sampling_logp_difference/max": 1.9287629127502441,
      "sampling/sampling_logp_difference/mean": 0.108085498213768,
      "step": 435,
      "step_time": 9.750276677000329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.574884194880724,
      "epoch": 0.00436,
      "grad_norm": 0.005062948912382126,
      "kl": 3.17036358686164,
      "learning_rate": 7.999940761836937e-06,
      "loss": -0.0001,
      "step": 436,
      "step_time": 6.396997322999596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0359537526965141,
      "epoch": 0.00437,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 3.464319524937309e-05,
      "kl": 0.48364219442009926,
      "learning_rate": 7.999940465276862e-06,
      "loss": 0.0,
      "num_tokens": 10312773.0,
      "reward": 1.9500000476837158,
      "reward_std": 1.2443419694900513,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 0.9713358879089355,
      "sampling/importance_sampling_ratio/mean": 0.8230075240135193,
      "sampling/importance_sampling_ratio/min": 0.5460324883460999,
      "sampling/sampling_logp_difference/max": 0.39858847856521606,
      "sampling/sampling_logp_difference/mean": 0.08530294895172119,
      "step": 437,
      "step_time": 10.383505194999088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0389421209692955,
      "epoch": 0.00438,
      "grad_norm": 5.601278826361522e-05,
      "kl": 0.4953901164699346,
      "learning_rate": 7.999940167976326e-06,
      "loss": 0.0,
      "step": 438,
      "step_time": 5.959065071000623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.8178563863039017,
      "epoch": 0.00439,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0030496560502797365,
      "kl": 1.5864214673638344,
      "learning_rate": 7.999939869935323e-06,
      "loss": -0.0001,
      "num_tokens": 10361704.0,
      "reward": 2.106250047683716,
      "reward_std": 0.9873188138008118,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.75,
      "rewards/probe_completion_length/std": 0.4399413466453552,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.8125,
      "rewards/probe_shaping_dominance/std": 0.3965577781200409,
      "rewards/probe_terminal_raw/mean": 0.1875,
      "rewards/probe_terminal_raw/std": 0.3965577781200409,
      "rewards/rollout_reward_func/mean": -0.625,
      "rewards/rollout_reward_func/std": 0.7931155562400818,
      "sampling/importance_sampling_ratio/max": 2.53995418548584,
      "sampling/importance_sampling_ratio/mean": 0.8853044509887695,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.1533396244049072,
      "sampling/sampling_logp_difference/mean": 0.18097969889640808,
      "step": 439,
      "step_time": 10.668017594998673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.834572046995163,
      "epoch": 0.0044,
      "grad_norm": 0.003595987567678094,
      "kl": 1.560511738061905,
      "learning_rate": 7.999939571153855e-06,
      "loss": -0.0001,
      "step": 440,
      "step_time": 6.122170336001545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0095400363206863,
      "epoch": 0.00441,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0025006576906889677,
      "kl": 1.4991996549069881,
      "learning_rate": 7.999939271631924e-06,
      "loss": -0.0001,
      "num_tokens": 10413488.0,
      "reward": 2.043750047683716,
      "reward_std": 0.9283830523490906,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.84375,
      "rewards/probe_shaping_dominance/std": 0.3689020276069641,
      "rewards/probe_terminal_raw/mean": 0.15625,
      "rewards/probe_terminal_raw/std": 0.3689020276069641,
      "rewards/rollout_reward_func/mean": -0.6875,
      "rewards/rollout_reward_func/std": 0.7378040552139282,
      "sampling/importance_sampling_ratio/max": 1.7598189115524292,
      "sampling/importance_sampling_ratio/mean": 0.8456007242202759,
      "sampling/importance_sampling_ratio/min": 0.22255338728427887,
      "sampling/sampling_logp_difference/max": 1.387916088104248,
      "sampling/sampling_logp_difference/mean": 0.17741605639457703,
      "step": 441,
      "step_time": 10.27434647300106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.015625,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.015625,
      "entropy": 1.0087963938713074,
      "epoch": 0.00442,
      "grad_norm": 0.0019897983875125647,
      "kl": 1.8581994846463203,
      "learning_rate": 7.999938971369529e-06,
      "loss": -0.0001,
      "step": 442,
      "step_time": 6.660500067999237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9083375707268715,
      "epoch": 0.00443,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0012449432397261262,
      "kl": 0.7166382949799299,
      "learning_rate": 7.999938670366667e-06,
      "loss": -0.0001,
      "num_tokens": 10460910.0,
      "reward": 2.450000047683716,
      "reward_std": 1.3912166357040405,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5625,
      "rewards/probe_shaping_dominance/std": 0.504016101360321,
      "rewards/probe_terminal_raw/mean": 0.4375,
      "rewards/probe_terminal_raw/std": 0.504016101360321,
      "rewards/rollout_reward_func/mean": -0.125,
      "rewards/rollout_reward_func/std": 1.008032202720642,
      "sampling/importance_sampling_ratio/max": 1.1054221391677856,
      "sampling/importance_sampling_ratio/mean": 0.8311811089515686,
      "sampling/importance_sampling_ratio/min": 0.3434506058692932,
      "sampling/sampling_logp_difference/max": 1.0333223342895508,
      "sampling/sampling_logp_difference/mean": 0.09627163410186768,
      "step": 443,
      "step_time": 10.282398681000814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9153599664568901,
      "epoch": 0.00444,
      "grad_norm": 0.0012570996768772602,
      "kl": 0.7166626800899394,
      "learning_rate": 7.999938368623343e-06,
      "loss": -0.0001,
      "step": 444,
      "step_time": 5.903002717001982
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.057725302875042,
      "epoch": 0.00445,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0031768870539963245,
      "kl": 1.6499460637569427,
      "learning_rate": 7.999938066139555e-06,
      "loss": -0.0001,
      "num_tokens": 10509423.0,
      "reward": 2.231250047683716,
      "reward_std": 1.3255400657653809,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.65625,
      "rewards/probe_shaping_dominance/std": 0.4825586974620819,
      "rewards/probe_terminal_raw/mean": 0.34375,
      "rewards/probe_terminal_raw/std": 0.4825586974620819,
      "rewards/rollout_reward_func/mean": -0.3125,
      "rewards/rollout_reward_func/std": 0.9651173949241638,
      "sampling/importance_sampling_ratio/max": 1.0972373485565186,
      "sampling/importance_sampling_ratio/mean": 0.781715989112854,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.737563133239746,
      "sampling/sampling_logp_difference/mean": 0.16636095941066742,
      "step": 445,
      "step_time": 10.328625375999763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 1.0501547306776047,
      "epoch": 0.00446,
      "grad_norm": 0.0023717940784990788,
      "kl": 1.9352448098361492,
      "learning_rate": 7.9999377629153e-06,
      "loss": -0.0001,
      "step": 446,
      "step_time": 7.223937839999053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.53125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.0991413369774818,
      "epoch": 0.00447,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00013539350766222924,
      "kl": 0.8366762921214104,
      "learning_rate": 7.999937458950583e-06,
      "loss": 0.0,
      "num_tokens": 10556571.0,
      "reward": 1.9812500476837158,
      "reward_std": 1.2309025526046753,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.53125,
      "rewards/probe_completion_length/std": 0.507007360458374,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.5306068658828735,
      "sampling/importance_sampling_ratio/mean": 0.896122932434082,
      "sampling/importance_sampling_ratio/min": 0.3056519329547882,
      "sampling/sampling_logp_difference/max": 1.0416702032089233,
      "sampling/sampling_logp_difference/mean": 0.12326531112194061,
      "step": 447,
      "step_time": 10.101042933999452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0788709297776222,
      "epoch": 0.00448,
      "grad_norm": 0.00021080716396681964,
      "kl": 0.8710470348596573,
      "learning_rate": 7.999937154245402e-06,
      "loss": 0.0,
      "step": 448,
      "step_time": 6.108521285000279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5352600961923599,
      "epoch": 0.00449,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 1.1860423001053277e-05,
      "kl": 0.4157614109572023,
      "learning_rate": 7.999936848799756e-06,
      "loss": 0.0,
      "num_tokens": 10598609.0,
      "reward": 3.262500047683716,
      "reward_std": 1.2296733856201172,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.25,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.75,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": 0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 0.9666202664375305,
      "sampling/importance_sampling_ratio/mean": 0.9112852215766907,
      "sampling/importance_sampling_ratio/min": 0.6785628199577332,
      "sampling/sampling_logp_difference/max": 0.21126577258110046,
      "sampling/sampling_logp_difference/mean": 0.03843056410551071,
      "step": 449,
      "step_time": 9.731360219000635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5475796833634377,
      "epoch": 0.0045,
      "grad_norm": 9.530402167001739e-06,
      "kl": 0.4151203960645944,
      "learning_rate": 7.999936542613647e-06,
      "loss": 0.0,
      "step": 450,
      "step_time": 6.8167696039990915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.4937551617622375,
      "epoch": 0.00451,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0027509112842381,
      "kl": 0.8361366703175008,
      "learning_rate": 7.999936235687075e-06,
      "loss": -0.0001,
      "num_tokens": 10644998.0,
      "reward": 2.043750047683716,
      "reward_std": 1.7479827404022217,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 1.4280457496643066,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.78125,
      "rewards/probe_shaping_dominance/std": 0.420013427734375,
      "rewards/probe_terminal_raw/mean": 0.21875,
      "rewards/probe_terminal_raw/std": 0.420013427734375,
      "rewards/rollout_reward_func/mean": -0.5625,
      "rewards/rollout_reward_func/std": 0.84002685546875,
      "sampling/importance_sampling_ratio/max": 2.1693289279937744,
      "sampling/importance_sampling_ratio/mean": 0.8219958543777466,
      "sampling/importance_sampling_ratio/min": 0.3935191333293915,
      "sampling/sampling_logp_difference/max": 0.8192603588104248,
      "sampling/sampling_logp_difference/mean": 0.15191489458084106,
      "step": 451,
      "step_time": 9.84771198999988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.4842835664749146,
      "epoch": 0.00452,
      "grad_norm": 0.0019148472929373384,
      "kl": 0.8515774551779032,
      "learning_rate": 7.999935928020036e-06,
      "loss": -0.0001,
      "step": 452,
      "step_time": 5.956881039999644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.2700788527727127,
      "epoch": 0.00453,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 7.074048335198313e-05,
      "kl": 0.7058316059410572,
      "learning_rate": 7.999935619612536e-06,
      "loss": 0.0,
      "num_tokens": 10694855.0,
      "reward": 1.9500000476837158,
      "reward_std": 1.2443419694900513,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.1525969505310059,
      "sampling/importance_sampling_ratio/mean": 0.8169641494750977,
      "sampling/importance_sampling_ratio/min": 0.5228918194770813,
      "sampling/sampling_logp_difference/max": 0.5732545852661133,
      "sampling/sampling_logp_difference/mean": 0.11736132204532623,
      "step": 453,
      "step_time": 9.987618331999329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.278110608458519,
      "epoch": 0.00454,
      "grad_norm": 7.446208473993465e-05,
      "kl": 0.7188287526369095,
      "learning_rate": 7.999935310464572e-06,
      "loss": 0.0,
      "step": 454,
      "step_time": 7.042841277000662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6826481185853481,
      "epoch": 0.00455,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0026605785824358463,
      "kl": 3.3382283598184586,
      "learning_rate": 7.999935000576144e-06,
      "loss": -0.0,
      "num_tokens": 10740810.0,
      "reward": 2.731250047683716,
      "reward_std": 1.2110878229141235,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.53125,
      "rewards/probe_shaping_dominance/std": 0.507007360458374,
      "rewards/probe_terminal_raw/mean": 0.46875,
      "rewards/probe_terminal_raw/std": 0.507007360458374,
      "rewards/rollout_reward_func/mean": -0.0625,
      "rewards/rollout_reward_func/std": 1.014014720916748,
      "sampling/importance_sampling_ratio/max": 1.809893012046814,
      "sampling/importance_sampling_ratio/mean": 0.8945350050926208,
      "sampling/importance_sampling_ratio/min": 0.2569253146648407,
      "sampling/sampling_logp_difference/max": 1.2167439460754395,
      "sampling/sampling_logp_difference/mean": 0.08854126930236816,
      "step": 455,
      "step_time": 10.07339215400043
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6866127476096153,
      "epoch": 0.00456,
      "grad_norm": 0.001385181793011725,
      "kl": 3.1702629774808884,
      "learning_rate": 7.999934689947254e-06,
      "loss": -0.0,
      "step": 456,
      "step_time": 6.053322766999372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.31620290130376816,
      "epoch": 0.00457,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.6734403187874705e-05,
      "kl": 1.541536584496498,
      "learning_rate": 7.999934378577899e-06,
      "loss": 0.0,
      "num_tokens": 10783603.0,
      "reward": 2.950000047683716,
      "reward_std": 1.0160009860992432,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.0,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.371078372001648,
      "sampling/importance_sampling_ratio/mean": 0.9306595921516418,
      "sampling/importance_sampling_ratio/min": 0.1436803787946701,
      "sampling/sampling_logp_difference/max": 1.6921932697296143,
      "sampling/sampling_logp_difference/mean": 0.06238972768187523,
      "step": 457,
      "step_time": 10.153244150002138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.31905546225607395,
      "epoch": 0.00458,
      "grad_norm": 4.7455170715693384e-05,
      "kl": 1.537596296519041,
      "learning_rate": 7.999934066468082e-06,
      "loss": 0.0,
      "step": 458,
      "step_time": 6.367441510999015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.675567265599966,
      "epoch": 0.00459,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.736534123774618e-05,
      "kl": 1.7775992900133133,
      "learning_rate": 7.9999337536178e-06,
      "loss": 0.0,
      "num_tokens": 10828138.0,
      "reward": 2.731250047683716,
      "reward_std": 1.2885193824768066,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.9140326976776123,
      "sampling/importance_sampling_ratio/mean": 0.9304972887039185,
      "sampling/importance_sampling_ratio/min": 0.31738898158073425,
      "sampling/sampling_logp_difference/max": 0.7054009437561035,
      "sampling/sampling_logp_difference/mean": 0.08023737370967865,
      "step": 459,
      "step_time": 9.88763274899975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6782685145735741,
      "epoch": 0.0046,
      "grad_norm": 6.407535693142563e-05,
      "kl": 1.7728114277124405,
      "learning_rate": 7.999933440027056e-06,
      "loss": 0.0,
      "step": 460,
      "step_time": 5.971275758998672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.65625,
      "completions/mean_terminated_length": 2.65625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.009953811764717,
      "epoch": 0.00461,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00026355075533501804,
      "kl": 0.9404683746397495,
      "learning_rate": 7.999933125695849e-06,
      "loss": 0.0,
      "num_tokens": 10876467.0,
      "reward": 2.106250047683716,
      "reward_std": 1.1670026779174805,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 0.4825586974620819,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 2.7634549140930176,
      "sampling/importance_sampling_ratio/mean": 0.867573082447052,
      "sampling/importance_sampling_ratio/min": 0.36806216835975647,
      "sampling/sampling_logp_difference/max": 1.037782907485962,
      "sampling/sampling_logp_difference/mean": 0.11394397169351578,
      "step": 461,
      "step_time": 11.22426544300015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0309009402990341,
      "epoch": 0.00462,
      "grad_norm": 0.00029655409161932766,
      "kl": 0.9389306157827377,
      "learning_rate": 7.99993281062418e-06,
      "loss": 0.0,
      "step": 462,
      "step_time": 6.238317131001168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9029425978660583,
      "epoch": 0.00463,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 3.5131532058585435e-05,
      "kl": 0.7519008315866813,
      "learning_rate": 7.999932494812047e-06,
      "loss": 0.0,
      "num_tokens": 10918254.0,
      "reward": 2.012500047683716,
      "reward_std": 1.2164862155914307,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 2.604414224624634,
      "sampling/importance_sampling_ratio/mean": 0.9013475775718689,
      "sampling/importance_sampling_ratio/min": 0.5035668611526489,
      "sampling/sampling_logp_difference/max": 0.8525474071502686,
      "sampling/sampling_logp_difference/mean": 0.09090504050254822,
      "step": 463,
      "step_time": 9.717574374998549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9116902351379395,
      "epoch": 0.00464,
      "grad_norm": 3.008692874573171e-05,
      "kl": 0.7550733370007947,
      "learning_rate": 7.999932178259451e-06,
      "loss": 0.0,
      "step": 464,
      "step_time": 6.374117108998689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7433590814471245,
      "epoch": 0.00465,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.008980808779597282,
      "kl": 4.237264834344387,
      "learning_rate": 7.999931860966393e-06,
      "loss": 0.0001,
      "num_tokens": 10967792.0,
      "reward": 1.7000000476837158,
      "reward_std": 0.4399413466453552,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.75,
      "rewards/probe_completion_length/std": 0.4399413466453552,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 2.894777774810791,
      "sampling/importance_sampling_ratio/mean": 0.8646867871284485,
      "sampling/importance_sampling_ratio/min": 0.03391490504145622,
      "sampling/sampling_logp_difference/max": 3.4479689598083496,
      "sampling/sampling_logp_difference/mean": 0.24308747053146362,
      "step": 465,
      "step_time": 10.31518953199975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7749368958175182,
      "epoch": 0.00466,
      "grad_norm": 0.001255774055607617,
      "kl": 2.4550288692116737,
      "learning_rate": 7.999931542932872e-06,
      "loss": 0.0,
      "step": 466,
      "step_time": 6.806484487000489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.4375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.4020432382822037,
      "epoch": 0.00467,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 4.1622817661846057e-05,
      "kl": 0.062206802947912365,
      "learning_rate": 7.999931224158886e-06,
      "loss": 0.0,
      "num_tokens": 11014060.0,
      "reward": 1.4500000476837158,
      "reward_std": 1.436842441558838,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 1.436842441558838,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.131276249885559,
      "sampling/importance_sampling_ratio/mean": 0.7747564315795898,
      "sampling/importance_sampling_ratio/min": 0.3255142569541931,
      "sampling/sampling_logp_difference/max": 0.6411696076393127,
      "sampling/sampling_logp_difference/mean": 0.135621577501297,
      "step": 467,
      "step_time": 9.957253848999244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.4081729501485825,
      "epoch": 0.00468,
      "grad_norm": 4.6337001549545676e-05,
      "kl": 0.06316643842728809,
      "learning_rate": 7.999930904644442e-06,
      "loss": 0.0,
      "step": 468,
      "step_time": 6.577253157999621
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1130.0,
      "completions/max_terminated_length": 1130.0,
      "completions/mean_length": 37.96875,
      "completions/mean_terminated_length": 37.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.1360742896795273,
      "epoch": 0.00469,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.00010231900523649529,
      "kl": 1.0736553617753088,
      "learning_rate": 7.999930584389531e-06,
      "loss": 0.0,
      "num_tokens": 11066179.0,
      "reward": 104.69999694824219,
      "reward_std": 579.4194946289062,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 104.25,
      "rewards/probe_completion_length/std": 579.5098876953125,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.8770849704742432,
      "sampling/importance_sampling_ratio/mean": 0.8900884389877319,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 2.822144031524658,
      "sampling/sampling_logp_difference/mean": 0.27112311124801636,
      "step": 469,
      "step_time": 23.543196358000387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1283305957913399,
      "epoch": 0.0047,
      "grad_norm": 0.00010479523916728795,
      "kl": 1.07857805211097,
      "learning_rate": 7.999930263394161e-06,
      "loss": 0.0,
      "step": 470,
      "step_time": 9.207572682000318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6140802390873432,
      "epoch": 0.00471,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 8.397161582252011e-05,
      "kl": 1.2529713734984398,
      "learning_rate": 7.999929941658327e-06,
      "loss": 0.0,
      "num_tokens": 11108801.0,
      "reward": 2.762500047683716,
      "reward_std": 1.2556325197219849,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 2.2854928970336914,
      "sampling/importance_sampling_ratio/mean": 0.9871470928192139,
      "sampling/importance_sampling_ratio/min": 0.4138258695602417,
      "sampling/sampling_logp_difference/max": 0.8604846000671387,
      "sampling/sampling_logp_difference/mean": 0.08210620284080505,
      "step": 471,
      "step_time": 9.92915231400002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6049374490976334,
      "epoch": 0.00472,
      "grad_norm": 5.9758218412753195e-05,
      "kl": 1.2381772547960281,
      "learning_rate": 7.999929619182034e-06,
      "loss": 0.0,
      "step": 472,
      "step_time": 6.531484164000176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6399192288517952,
      "epoch": 0.00473,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0001346557110082358,
      "kl": 1.022419050335884,
      "learning_rate": 7.999929295965276e-06,
      "loss": 0.0,
      "num_tokens": 11154060.0,
      "reward": 2.731250047683716,
      "reward_std": 1.2885193824768066,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.9041831493377686,
      "sampling/importance_sampling_ratio/mean": 0.9148904085159302,
      "sampling/importance_sampling_ratio/min": 0.41519424319267273,
      "sampling/sampling_logp_difference/max": 0.7960150241851807,
      "sampling/sampling_logp_difference/mean": 0.09308066964149475,
      "step": 473,
      "step_time": 10.540310639998097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.646208006888628,
      "epoch": 0.00474,
      "grad_norm": 7.723711314611137e-05,
      "kl": 0.993750736117363,
      "learning_rate": 7.999928972008055e-06,
      "loss": 0.0,
      "step": 474,
      "step_time": 6.0534506069980125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9178853631019592,
      "epoch": 0.00475,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007165942224673927,
      "kl": 0.6427033228101209,
      "learning_rate": 7.999928647310374e-06,
      "loss": 0.0,
      "num_tokens": 11202464.0,
      "reward": 1.7937500476837158,
      "reward_std": 1.221035361289978,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 1.221035361289978,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.1609777212142944,
      "sampling/importance_sampling_ratio/mean": 0.8023998737335205,
      "sampling/importance_sampling_ratio/min": 0.39279136061668396,
      "sampling/sampling_logp_difference/max": 0.8687930107116699,
      "sampling/sampling_logp_difference/mean": 0.12288682162761688,
      "step": 475,
      "step_time": 10.169633613001679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9118505492806435,
      "epoch": 0.00476,
      "grad_norm": 0.000408078107284382,
      "kl": 0.5350325474282727,
      "learning_rate": 7.99992832187223e-06,
      "loss": 0.0,
      "step": 476,
      "step_time": 6.567168232002587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.40625,
      "completions/mean_terminated_length": 2.40625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.200137197971344,
      "epoch": 0.00477,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 7.701815047767013e-05,
      "kl": 0.7950491067022085,
      "learning_rate": 7.999927995693626e-06,
      "loss": 0.0,
      "num_tokens": 11251359.0,
      "reward": 1.3562500476837158,
      "reward_std": 0.498990923166275,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.40625,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 2.108764886856079,
      "sampling/importance_sampling_ratio/mean": 0.8880026340484619,
      "sampling/importance_sampling_ratio/min": 0.40861380100250244,
      "sampling/sampling_logp_difference/max": 0.7756710052490234,
      "sampling/sampling_logp_difference/mean": 0.142902672290802,
      "step": 477,
      "step_time": 10.48534435199872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1992550939321518,
      "epoch": 0.00478,
      "grad_norm": 7.540653314208612e-05,
      "kl": 0.7962087821215391,
      "learning_rate": 7.999927668774559e-06,
      "loss": 0.0,
      "step": 478,
      "step_time": 6.0412125680004465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5685002394020557,
      "epoch": 0.00479,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 1.6812458852655254e-05,
      "kl": 0.6405358919873834,
      "learning_rate": 7.99992734111503e-06,
      "loss": 0.0,
      "num_tokens": 11296370.0,
      "reward": 3.231250047683716,
      "reward_std": 1.2759405374526978,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.25,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.75,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": 0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.0279954671859741,
      "sampling/importance_sampling_ratio/mean": 0.9103195667266846,
      "sampling/importance_sampling_ratio/min": 0.6296721696853638,
      "sampling/sampling_logp_difference/max": 0.38632071018218994,
      "sampling/sampling_logp_difference/mean": 0.04309441149234772,
      "step": 479,
      "step_time": 9.993394022998473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5688362512737513,
      "epoch": 0.0048,
      "grad_norm": 1.9768529455177486e-05,
      "kl": 0.6405658056028187,
      "learning_rate": 7.99992701271504e-06,
      "loss": 0.0,
      "step": 480,
      "step_time": 6.553862806000325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6740151382982731,
      "epoch": 0.00481,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0008734637522138655,
      "kl": 2.5446321358904243,
      "learning_rate": 7.99992668357459e-06,
      "loss": -0.0,
      "num_tokens": 11344807.0,
      "reward": 2.762500047683716,
      "reward_std": 1.1760376691818237,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.53125,
      "rewards/probe_shaping_dominance/std": 0.507007360458374,
      "rewards/probe_terminal_raw/mean": 0.46875,
      "rewards/probe_terminal_raw/std": 0.507007360458374,
      "rewards/rollout_reward_func/mean": -0.0625,
      "rewards/rollout_reward_func/std": 1.014014720916748,
      "sampling/importance_sampling_ratio/max": 2.2089757919311523,
      "sampling/importance_sampling_ratio/mean": 0.9335870146751404,
      "sampling/importance_sampling_ratio/min": 0.3754686415195465,
      "sampling/sampling_logp_difference/max": 0.992009162902832,
      "sampling/sampling_logp_difference/mean": 0.08749516308307648,
      "step": 481,
      "step_time": 10.437285636000524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6751360222697258,
      "epoch": 0.00482,
      "grad_norm": 0.0008649210212752223,
      "kl": 2.542562405578792,
      "learning_rate": 7.999926353693675e-06,
      "loss": -0.0,
      "step": 482,
      "step_time": 5.9687393949998295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.018196739256382,
      "epoch": 0.00483,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0012504520127549767,
      "kl": 1.1826329734176397,
      "learning_rate": 7.999926023072302e-06,
      "loss": -0.0,
      "num_tokens": 11392337.0,
      "reward": 1.9812500476837158,
      "reward_std": 1.1495966911315918,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.78125,
      "rewards/probe_shaping_dominance/std": 0.420013427734375,
      "rewards/probe_terminal_raw/mean": 0.21875,
      "rewards/probe_terminal_raw/std": 0.420013427734375,
      "rewards/rollout_reward_func/mean": -0.5625,
      "rewards/rollout_reward_func/std": 0.84002685546875,
      "sampling/importance_sampling_ratio/max": 1.1086639165878296,
      "sampling/importance_sampling_ratio/mean": 0.8167065382003784,
      "sampling/importance_sampling_ratio/min": 0.43818601965904236,
      "sampling/sampling_logp_difference/max": 0.6695658564567566,
      "sampling/sampling_logp_difference/mean": 0.11686171591281891,
      "step": 483,
      "step_time": 10.05440507900039
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0212653428316116,
      "epoch": 0.00484,
      "grad_norm": 0.0011736839078366756,
      "kl": 1.1902284994721413,
      "learning_rate": 7.999925691710467e-06,
      "loss": -0.0,
      "step": 484,
      "step_time": 6.5248691330007205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.71875,
      "completions/mean_terminated_length": 2.71875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.051191158592701,
      "epoch": 0.00485,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0008444490376859903,
      "kl": 1.4066885709762573,
      "learning_rate": 7.99992535960817e-06,
      "loss": 0.0,
      "num_tokens": 11444223.0,
      "reward": 1.6687500476837158,
      "reward_std": 0.45680341124534607,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.71875,
      "rewards/probe_completion_length/std": 0.45680341124534607,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 2.790355920791626,
      "sampling/importance_sampling_ratio/mean": 0.8630131483078003,
      "sampling/importance_sampling_ratio/min": 0.07368746399879456,
      "sampling/sampling_logp_difference/max": 1.5465178489685059,
      "sampling/sampling_logp_difference/mean": 0.1692320853471756,
      "step": 485,
      "step_time": 10.868371823001326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.0457961112260818,
      "epoch": 0.00486,
      "grad_norm": 0.0008401769446209073,
      "kl": 1.312499899417162,
      "learning_rate": 7.999925026765412e-06,
      "loss": 0.0,
      "step": 486,
      "step_time": 6.268463713998244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7576271295547485,
      "epoch": 0.00487,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0004383134946692735,
      "kl": 1.3531483765691519,
      "learning_rate": 7.999924693182194e-06,
      "loss": 0.0,
      "num_tokens": 11487384.0,
      "reward": 1.5750000476837158,
      "reward_std": 0.49186936020851135,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.625,
      "rewards/probe_completion_length/std": 0.49186936020851135,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.2666070461273193,
      "sampling/importance_sampling_ratio/mean": 0.8539038896560669,
      "sampling/importance_sampling_ratio/min": 0.26787251234054565,
      "sampling/sampling_logp_difference/max": 1.2270400524139404,
      "sampling/sampling_logp_difference/mean": 0.10817140340805054,
      "step": 487,
      "step_time": 9.842391171998315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7594375871121883,
      "epoch": 0.00488,
      "grad_norm": 0.0006697545177303255,
      "kl": 1.3161140335723758,
      "learning_rate": 7.999924358858514e-06,
      "loss": 0.0,
      "step": 488,
      "step_time": 6.4828962940000565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.883793793618679,
      "epoch": 0.00489,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0034089318942278624,
      "kl": 0.3923273850377882,
      "learning_rate": 7.999924023794374e-06,
      "loss": -0.0001,
      "num_tokens": 11536908.0,
      "reward": 2.6687498092651367,
      "reward_std": 1.6310372352600098,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.03125,
      "rewards/probe_completion_length/std": 1.331610083580017,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.65625,
      "rewards/probe_shaping_dominance/std": 0.4825586974620819,
      "rewards/probe_terminal_raw/mean": 0.34375,
      "rewards/probe_terminal_raw/std": 0.4825586974620819,
      "rewards/rollout_reward_func/mean": -0.3125,
      "rewards/rollout_reward_func/std": 0.9651173949241638,
      "sampling/importance_sampling_ratio/max": 1.9263346195220947,
      "sampling/importance_sampling_ratio/mean": 0.8542444109916687,
      "sampling/importance_sampling_ratio/min": 0.4926546812057495,
      "sampling/sampling_logp_difference/max": 0.520141065120697,
      "sampling/sampling_logp_difference/mean": 0.09610345959663391,
      "step": 489,
      "step_time": 10.60969110699989
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.8811840042471886,
      "epoch": 0.0049,
      "grad_norm": 0.0034968475811183453,
      "kl": 0.3980398298444925,
      "learning_rate": 7.999923687989774e-06,
      "loss": -0.0001,
      "step": 490,
      "step_time": 6.079914465999536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.2838631570339203,
      "epoch": 0.00491,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0006703120889142156,
      "kl": 2.198386996984482,
      "learning_rate": 7.999923351444713e-06,
      "loss": -0.0,
      "num_tokens": 11585701.0,
      "reward": 3.887500047683716,
      "reward_std": 0.3535533845424652,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.0,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.03125,
      "rewards/probe_shaping_dominance/std": 0.1767766922712326,
      "rewards/probe_terminal_raw/mean": 0.96875,
      "rewards/probe_terminal_raw/std": 0.1767766922712326,
      "rewards/rollout_reward_func/mean": 0.9375,
      "rewards/rollout_reward_func/std": 0.3535533845424652,
      "sampling/importance_sampling_ratio/max": 1.039068341255188,
      "sampling/importance_sampling_ratio/mean": 0.9455829858779907,
      "sampling/importance_sampling_ratio/min": 0.30147337913513184,
      "sampling/sampling_logp_difference/max": 1.0723538398742676,
      "sampling/sampling_logp_difference/mean": 0.033339060842990875,
      "step": 491,
      "step_time": 9.407341567999538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.2829307820647955,
      "epoch": 0.00492,
      "grad_norm": 0.00041976518696174026,
      "kl": 2.2534179985523224,
      "learning_rate": 7.99992301415919e-06,
      "loss": -0.0,
      "step": 492,
      "step_time": 6.122673765999025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.59375,
      "completions/mean_terminated_length": 2.59375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9248806834220886,
      "epoch": 0.00493,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 5.793804666609503e-05,
      "kl": 0.2539625926874578,
      "learning_rate": 7.999922676133208e-06,
      "loss": 0.0,
      "num_tokens": 11631151.0,
      "reward": 2.043750047683716,
      "reward_std": 1.201058030128479,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.59375,
      "rewards/probe_completion_length/std": 0.49899089336395264,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.1576757431030273,
      "sampling/importance_sampling_ratio/mean": 0.8414940237998962,
      "sampling/importance_sampling_ratio/min": 0.6457847356796265,
      "sampling/sampling_logp_difference/max": 0.37067556381225586,
      "sampling/sampling_logp_difference/mean": 0.0808270201086998,
      "step": 493,
      "step_time": 10.59047884300162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9139997959136963,
      "epoch": 0.00494,
      "grad_norm": 5.656076973536983e-05,
      "kl": 0.2586546838283539,
      "learning_rate": 7.999922337366765e-06,
      "loss": 0.0,
      "step": 494,
      "step_time": 6.090953175001232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.7931460365653038,
      "epoch": 0.00495,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.027599738445133e-05,
      "kl": 1.1627589538693428,
      "learning_rate": 7.99992199785986e-06,
      "loss": 0.0,
      "num_tokens": 11676700.0,
      "reward": 2.450000047683716,
      "reward_std": 1.5240014791488647,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5,
      "rewards/probe_completion_length/std": 0.5080004930496216,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.3997493982315063,
      "sampling/importance_sampling_ratio/mean": 0.8892567753791809,
      "sampling/importance_sampling_ratio/min": 0.4850221574306488,
      "sampling/sampling_logp_difference/max": 0.6389734148979187,
      "sampling/sampling_logp_difference/mean": 0.07556334882974625,
      "step": 495,
      "step_time": 10.246895737998784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.7806414198130369,
      "epoch": 0.00496,
      "grad_norm": 6.212398147908971e-05,
      "kl": 1.1609262004494667,
      "learning_rate": 7.999921657612498e-06,
      "loss": 0.0,
      "step": 496,
      "step_time": 6.424024951001229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.1086363270878792,
      "epoch": 0.00497,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0006601955974474549,
      "kl": 0.9634743640199304,
      "learning_rate": 7.999921316624673e-06,
      "loss": 0.0,
      "num_tokens": 11723406.0,
      "reward": 1.5125000476837158,
      "reward_std": 0.504016101360321,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.6105371713638306,
      "sampling/importance_sampling_ratio/mean": 0.8632278442382812,
      "sampling/importance_sampling_ratio/min": 0.45380502939224243,
      "sampling/sampling_logp_difference/max": 0.7139332294464111,
      "sampling/sampling_logp_difference/mean": 0.11729858815670013,
      "step": 497,
      "step_time": 9.990106498999012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.1224614009261131,
      "epoch": 0.00498,
      "grad_norm": 0.0005421527894213796,
      "kl": 0.9261851194314659,
      "learning_rate": 7.99992097489639e-06,
      "loss": 0.0,
      "step": 498,
      "step_time": 6.056653904000996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 2.625,
      "completions/mean_terminated_length": 2.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.22955272346735,
      "epoch": 0.00499,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0019598952494561672,
      "kl": 1.2435842752456665,
      "learning_rate": 7.999920632427647e-06,
      "loss": -0.0001,
      "num_tokens": 11774598.0,
      "reward": 2.043750047683716,
      "reward_std": 1.4888700246810913,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.65625,
      "rewards/probe_completion_length/std": 1.0957211256027222,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.78125,
      "rewards/probe_shaping_dominance/std": 0.420013427734375,
      "rewards/probe_terminal_raw/mean": 0.21875,
      "rewards/probe_terminal_raw/std": 0.420013427734375,
      "rewards/rollout_reward_func/mean": -0.5625,
      "rewards/rollout_reward_func/std": 0.84002685546875,
      "sampling/importance_sampling_ratio/max": 1.4074516296386719,
      "sampling/importance_sampling_ratio/mean": 0.8869490623474121,
      "sampling/importance_sampling_ratio/min": 0.37190303206443787,
      "sampling/sampling_logp_difference/max": 0.7668733596801758,
      "sampling/sampling_logp_difference/mean": 0.14819347858428955,
      "step": 499,
      "step_time": 10.845204309999644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 1.2178536430001259,
      "epoch": 0.005,
      "grad_norm": 0.0005243890336714685,
      "kl": 1.2198392376303673,
      "learning_rate": 7.999920289218444e-06,
      "loss": -0.0001,
      "step": 500,
      "step_time": 6.757824534000065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.6885988488793373,
      "epoch": 0.00501,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0009279325022362173,
      "kl": 2.073035791516304,
      "learning_rate": 7.999919945268779e-06,
      "loss": 0.0,
      "num_tokens": 11821828.0,
      "reward": 2.793750047683716,
      "reward_std": 1.221035361289978,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.5924896001815796,
      "sampling/importance_sampling_ratio/mean": 0.7612407207489014,
      "sampling/importance_sampling_ratio/min": 0.2523704469203949,
      "sampling/sampling_logp_difference/max": 0.8371151685714722,
      "sampling/sampling_logp_difference/mean": 0.1486271321773529,
      "step": 501,
      "step_time": 10.10802043700005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.6844572648406029,
      "epoch": 0.00502,
      "grad_norm": 0.0008749505504965782,
      "kl": 2.072124183177948,
      "learning_rate": 7.999919600578657e-06,
      "loss": 0.0,
      "step": 502,
      "step_time": 6.110820461997719
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.5625,
      "completions/mean_terminated_length": 2.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9634811952710152,
      "epoch": 0.00503,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 6.934843258932233e-05,
      "kl": 0.6088187689892948,
      "learning_rate": 7.999919255148074e-06,
      "loss": 0.0,
      "num_tokens": 11868393.0,
      "reward": 2.043750047683716,
      "reward_std": 1.201058030128479,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.5625,
      "rewards/probe_completion_length/std": 0.504016101360321,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.75,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.25,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": -0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 2.7559914588928223,
      "sampling/importance_sampling_ratio/mean": 0.9101375937461853,
      "sampling/importance_sampling_ratio/min": 0.4674367308616638,
      "sampling/sampling_logp_difference/max": 1.3814847469329834,
      "sampling/sampling_logp_difference/mean": 0.11712859570980072,
      "step": 503,
      "step_time": 10.467418278000878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9756656736135483,
      "epoch": 0.00504,
      "grad_norm": 6.629782728850842e-05,
      "kl": 0.6043889897409827,
      "learning_rate": 7.999918908977031e-06,
      "loss": 0.0,
      "step": 504,
      "step_time": 6.49657412599845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.75,
      "completions/mean_terminated_length": 2.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.9048156104981899,
      "epoch": 0.00505,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 9.591080743120983e-05,
      "kl": 0.7056189067661762,
      "learning_rate": 7.999918562065531e-06,
      "loss": 0.0,
      "num_tokens": 11914091.0,
      "reward": 2.731250047683716,
      "reward_std": 1.3133158683776855,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.4908435642719269,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.8559684753417969,
      "sampling/importance_sampling_ratio/mean": 0.8754900097846985,
      "sampling/importance_sampling_ratio/min": 0.41432133316993713,
      "sampling/sampling_logp_difference/max": 0.8816030025482178,
      "sampling/sampling_logp_difference/mean": 0.09118005633354187,
      "step": 505,
      "step_time": 10.203277341001012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.9119976498186588,
      "epoch": 0.00506,
      "grad_norm": 6.370534538291395e-05,
      "kl": 0.6997845359146595,
      "learning_rate": 7.999918214413569e-06,
      "loss": 0.0,
      "step": 506,
      "step_time": 6.172720281999318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.3125,
      "completions/mean_terminated_length": 2.3125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 1.5444683581590652,
      "epoch": 0.00507,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0025572467129677534,
      "kl": 1.402412122581154,
      "learning_rate": 7.999917866021148e-06,
      "loss": 0.0,
      "num_tokens": 11963464.0,
      "reward": 1.2625000476837158,
      "reward_std": 0.4709290862083435,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.3125,
      "rewards/probe_completion_length/std": 0.4709290862083435,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -1.0,
      "rewards/rollout_reward_func/std": 0.0,
      "sampling/importance_sampling_ratio/max": 1.1553281545639038,
      "sampling/importance_sampling_ratio/mean": 0.7280164361000061,
      "sampling/importance_sampling_ratio/min": 0.22703957557678223,
      "sampling/sampling_logp_difference/max": 0.753678560256958,
      "sampling/sampling_logp_difference/mean": 0.1670120656490326,
      "step": 507,
      "step_time": 10.870288863999122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 1.5414555668830872,
      "epoch": 0.00508,
      "grad_norm": 0.001770776347257197,
      "kl": 1.0420573372393847,
      "learning_rate": 7.999917516888269e-06,
      "loss": 0.0,
      "step": 508,
      "step_time": 6.611128889000611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5182851850986481,
      "epoch": 0.00509,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0007590521709062159,
      "kl": 2.343875676393509,
      "learning_rate": 7.99991716701493e-06,
      "loss": 0.0,
      "num_tokens": 12009204.0,
      "reward": 2.887500047683716,
      "reward_std": 1.1053389310836792,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 2.407043695449829,
      "sampling/importance_sampling_ratio/mean": 0.9590823650360107,
      "sampling/importance_sampling_ratio/min": 0.3875659108161926,
      "sampling/sampling_logp_difference/max": 0.8325726985931396,
      "sampling/sampling_logp_difference/mean": 0.10490264743566513,
      "step": 509,
      "step_time": 9.980003214000135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.5325130149722099,
      "epoch": 0.0051,
      "grad_norm": 0.0007388418889604509,
      "kl": 2.3545358180999756,
      "learning_rate": 7.999916816401132e-06,
      "loss": 0.0,
      "step": 510,
      "step_time": 6.473959976001424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.665569581091404,
      "epoch": 0.00511,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0003944501222576946,
      "kl": 1.5593384206295013,
      "learning_rate": 7.999916465046877e-06,
      "loss": 0.0,
      "num_tokens": 12054761.0,
      "reward": 2.762500047683716,
      "reward_std": 1.2556325197219849,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.5,
      "rewards/probe_shaping_dominance/std": 0.5080004930496216,
      "rewards/probe_terminal_raw/mean": 0.5,
      "rewards/probe_terminal_raw/std": 0.5080004930496216,
      "rewards/rollout_reward_func/mean": 0.0,
      "rewards/rollout_reward_func/std": 1.0160009860992432,
      "sampling/importance_sampling_ratio/max": 1.1313704252243042,
      "sampling/importance_sampling_ratio/mean": 0.8322467803955078,
      "sampling/importance_sampling_ratio/min": 0.3396356999874115,
      "sampling/sampling_logp_difference/max": 0.9650154113769531,
      "sampling/sampling_logp_difference/mean": 0.10194703191518784,
      "step": 511,
      "step_time": 10.016052311998465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.665514025837183,
      "epoch": 0.00512,
      "grad_norm": 0.00044581855763681233,
      "kl": 1.5684444457292557,
      "learning_rate": 7.99991611295216e-06,
      "loss": 0.0,
      "step": 512,
      "step_time": 6.577485599000283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.40841441601514816,
      "epoch": 0.00513,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0037170210853219032,
      "kl": 1.0080515779554844,
      "learning_rate": 7.999915760116986e-06,
      "loss": -0.0001,
      "num_tokens": 12104074.0,
      "reward": 3.700000047683716,
      "reward_std": 0.6720215082168579,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.0,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.125,
      "rewards/probe_shaping_dominance/std": 0.33601075410842896,
      "rewards/probe_terminal_raw/mean": 0.875,
      "rewards/probe_terminal_raw/std": 0.33601075410842896,
      "rewards/rollout_reward_func/mean": 0.75,
      "rewards/rollout_reward_func/std": 0.6720215082168579,
      "sampling/importance_sampling_ratio/max": 1.135704517364502,
      "sampling/importance_sampling_ratio/mean": 0.9181493520736694,
      "sampling/importance_sampling_ratio/min": 0.3490389287471771,
      "sampling/sampling_logp_difference/max": 0.7234532833099365,
      "sampling/sampling_logp_difference/mean": 0.05021464079618454,
      "step": 513,
      "step_time": 12.078954928000712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.02083333395421505,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.40340831875801086,
      "epoch": 0.00514,
      "grad_norm": 0.0032272571697831154,
      "kl": 1.0150629207491875,
      "learning_rate": 7.999915406541353e-06,
      "loss": -0.0001,
      "step": 514,
      "step_time": 7.781343302000096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02500000037252903,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02500000037252903,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.4827618896961212,
      "epoch": 0.00515,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.0024894806556403637,
      "kl": 1.4780778959393501,
      "learning_rate": 7.999915052225262e-06,
      "loss": -0.0,
      "num_tokens": 12162708.0,
      "reward": 3.075000047683716,
      "reward_std": 1.0998533964157104,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.40625,
      "rewards/probe_shaping_dominance/std": 0.49899089336395264,
      "rewards/probe_terminal_raw/mean": 0.59375,
      "rewards/probe_terminal_raw/std": 0.49899089336395264,
      "rewards/rollout_reward_func/mean": 0.1875,
      "rewards/rollout_reward_func/std": 0.9979817867279053,
      "sampling/importance_sampling_ratio/max": 1.5890978574752808,
      "sampling/importance_sampling_ratio/mean": 0.925313413143158,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.23642897605896,
      "sampling/sampling_logp_difference/mean": 0.07910196483135223,
      "step": 515,
      "step_time": 13.655495725999572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02500000037252903,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02500000037252903,
      "entropy": 0.46802522242069244,
      "epoch": 0.00516,
      "grad_norm": 0.002520441310480237,
      "kl": 1.4883594401180744,
      "learning_rate": 7.999914697168712e-06,
      "loss": -0.0,
      "step": 516,
      "step_time": 8.033845836000182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.40684185177087784,
      "epoch": 0.00517,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0035400697961449623,
      "kl": 1.5064535737037659,
      "learning_rate": 7.999914341371702e-06,
      "loss": -0.0001,
      "num_tokens": 12215507.0,
      "reward": 3.606250047683716,
      "reward_std": 0.8273305296897888,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.15625,
      "rewards/probe_shaping_dominance/std": 0.3689020276069641,
      "rewards/probe_terminal_raw/mean": 0.84375,
      "rewards/probe_terminal_raw/std": 0.3689020276069641,
      "rewards/rollout_reward_func/mean": 0.6875,
      "rewards/rollout_reward_func/std": 0.7378040552139282,
      "sampling/importance_sampling_ratio/max": 1.0795238018035889,
      "sampling/importance_sampling_ratio/mean": 0.8931475877761841,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.2370891571044922,
      "sampling/sampling_logp_difference/mean": 0.054866012185811996,
      "step": 517,
      "step_time": 14.256954984998629
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.4029547944664955,
      "epoch": 0.00518,
      "grad_norm": 0.0032185025047510862,
      "kl": 1.5003392770886421,
      "learning_rate": 7.999913984834237e-06,
      "loss": -0.0001,
      "step": 518,
      "step_time": 8.012493073000769
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.5593776553869247,
      "epoch": 0.00519,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.008156136609613895,
      "kl": 1.942481990903616,
      "learning_rate": 7.99991362755631e-06,
      "loss": -0.0002,
      "num_tokens": 12276740.0,
      "reward": 3.106250047683716,
      "reward_std": 1.050633430480957,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.40625,
      "rewards/probe_shaping_dominance/std": 0.49899089336395264,
      "rewards/probe_terminal_raw/mean": 0.59375,
      "rewards/probe_terminal_raw/std": 0.49899089336395264,
      "rewards/rollout_reward_func/mean": 0.1875,
      "rewards/rollout_reward_func/std": 0.9979817867279053,
      "sampling/importance_sampling_ratio/max": 2.931607961654663,
      "sampling/importance_sampling_ratio/mean": 0.8578198552131653,
      "sampling/importance_sampling_ratio/min": 0.10247347503900528,
      "sampling/sampling_logp_difference/max": 2.1812801361083984,
      "sampling/sampling_logp_difference/mean": 0.15371261537075043,
      "step": 519,
      "step_time": 13.328113587000189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.03645833395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.03645833395421505,
      "entropy": 0.5542311668395996,
      "epoch": 0.0052,
      "grad_norm": 0.008781365118920803,
      "kl": 3.49315713532269,
      "learning_rate": 7.999913269537927e-06,
      "loss": -0.0002,
      "step": 520,
      "step_time": 7.888086324001051
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 3.15625,
      "completions/mean_terminated_length": 3.15625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.3672683946788311,
      "epoch": 0.00521,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.006466452963650227,
      "kl": 3.0226640701293945,
      "learning_rate": 7.999912910779086e-06,
      "loss": 0.0,
      "num_tokens": 12334694.0,
      "reward": 3.6999998092651367,
      "reward_std": 1.2443419694900513,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.21875,
      "rewards/probe_completion_length/std": 1.2374368906021118,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.25,
      "rewards/probe_shaping_dominance/std": 0.4399413466453552,
      "rewards/probe_terminal_raw/mean": 0.75,
      "rewards/probe_terminal_raw/std": 0.4399413466453552,
      "rewards/rollout_reward_func/mean": 0.5,
      "rewards/rollout_reward_func/std": 0.8798826932907104,
      "sampling/importance_sampling_ratio/max": 1.2187602519989014,
      "sampling/importance_sampling_ratio/mean": 0.9314761161804199,
      "sampling/importance_sampling_ratio/min": 0.20403584837913513,
      "sampling/sampling_logp_difference/max": 1.4795587062835693,
      "sampling/sampling_logp_difference/mean": 0.05452505126595497,
      "step": 521,
      "step_time": 13.404775265998978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.36731596663594246,
      "epoch": 0.00522,
      "grad_norm": 0.003731419797986746,
      "kl": 2.32449933886528,
      "learning_rate": 7.999912551279787e-06,
      "loss": 0.0,
      "step": 522,
      "step_time": 7.9222454560003825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.2391245774924755,
      "epoch": 0.00523,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0008970812777988613,
      "kl": 2.0755921453237534,
      "learning_rate": 7.99991219104003e-06,
      "loss": 0.0001,
      "num_tokens": 12387223.0,
      "reward": 3.575000047683716,
      "reward_std": 0.7931155562400818,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.0,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.1875,
      "rewards/probe_shaping_dominance/std": 0.3965577781200409,
      "rewards/probe_terminal_raw/mean": 0.8125,
      "rewards/probe_terminal_raw/std": 0.3965577781200409,
      "rewards/rollout_reward_func/mean": 0.625,
      "rewards/rollout_reward_func/std": 0.7931155562400818,
      "sampling/importance_sampling_ratio/max": 1.8916287422180176,
      "sampling/importance_sampling_ratio/mean": 1.0045948028564453,
      "sampling/importance_sampling_ratio/min": 0.9068307280540466,
      "sampling/sampling_logp_difference/max": 0.6775636672973633,
      "sampling/sampling_logp_difference/mean": 0.026775330305099487,
      "step": 523,
      "step_time": 12.776815157999408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.23557228222489357,
      "epoch": 0.00524,
      "grad_norm": 0.0009381213458254933,
      "kl": 2.076133668422699,
      "learning_rate": 7.999911830059816e-06,
      "loss": 0.0001,
      "step": 524,
      "step_time": 8.225136326999746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 3.09375,
      "completions/mean_terminated_length": 3.09375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.4644627757370472,
      "epoch": 0.00525,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.005579262040555477,
      "kl": 2.1972502768039703,
      "learning_rate": 7.999911468339143e-06,
      "loss": 0.0001,
      "num_tokens": 12440743.0,
      "reward": 3.4812498092651367,
      "reward_std": 1.43649160861969,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.15625,
      "rewards/probe_completion_length/std": 1.2727763652801514,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.3125,
      "rewards/probe_shaping_dominance/std": 0.4709290862083435,
      "rewards/probe_terminal_raw/mean": 0.6875,
      "rewards/probe_terminal_raw/std": 0.4709290862083435,
      "rewards/rollout_reward_func/mean": 0.375,
      "rewards/rollout_reward_func/std": 0.941858172416687,
      "sampling/importance_sampling_ratio/max": 2.116163969039917,
      "sampling/importance_sampling_ratio/mean": 0.9502134323120117,
      "sampling/importance_sampling_ratio/min": 0.1773698925971985,
      "sampling/sampling_logp_difference/max": 1.583463191986084,
      "sampling/sampling_logp_difference/mean": 0.08440504968166351,
      "step": 525,
      "step_time": 12.90298909700141
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 200000,
  "num_input_tokens_seen": 12440743,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}