{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.006,
  "eval_steps": 500,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06598061177646741,
      "epoch": 2e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.003923534415662289,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0,
      "num_tokens": 102665.0,
      "reward": 2.355022430419922,
      "reward_std": 0.3552054464817047,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.501227617263794,
      "rewards/rollout_reward_func/std": 0.18640437722206116,
      "sampling/importance_sampling_ratio/max": 1.0961512327194214,
      "sampling/importance_sampling_ratio/mean": 0.9703092575073242,
      "sampling/importance_sampling_ratio/min": 0.5060414671897888,
      "sampling/sampling_logp_difference/max": 0.6756159067153931,
      "sampling/sampling_logp_difference/mean": 0.0183907151222229,
      "step": 1,
      "step_time": 29.075429260999726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.06598061177646741,
      "epoch": 4e-05,
      "grad_norm": 0.003917683847248554,
      "kl": 0.0,
      "learning_rate": 2.2857142857142855e-07,
      "loss": -0.0,
      "step": 2,
      "step_time": 11.468670933999988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05838002988048174,
      "epoch": 6e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004170146305114031,
      "kl": 0.0014184596652553338,
      "learning_rate": 4.571428571428571e-07,
      "loss": 0.0,
      "num_tokens": 205842.0,
      "reward": 2.2323365211486816,
      "reward_std": 0.41563019156455994,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.970565140247345,
      "rewards/probe_shaping_dominance/std": 0.11582481861114502,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.5944784879684448,
      "rewards/rollout_reward_func/std": 0.19796565175056458,
      "sampling/importance_sampling_ratio/max": 1.4160258769989014,
      "sampling/importance_sampling_ratio/mean": 1.0286931991577148,
      "sampling/importance_sampling_ratio/min": 0.8523033857345581,
      "sampling/sampling_logp_difference/max": 0.34715062379837036,
      "sampling/sampling_logp_difference/mean": 0.01565416157245636,
      "step": 3,
      "step_time": 26.976024578999954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.06273448248884961,
      "epoch": 8e-05,
      "grad_norm": 0.0025308942422270775,
      "kl": 0.004324701569430545,
      "learning_rate": 6.857142857142857e-07,
      "loss": 0.0,
      "step": 4,
      "step_time": 12.765090235000116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06146455561975017,
      "epoch": 0.0001,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.010321667417883873,
      "kl": 0.005618094519680539,
      "learning_rate": 9.142857142857142e-07,
      "loss": 0.0,
      "num_tokens": 303571.0,
      "reward": 2.236471176147461,
      "reward_std": 0.5468828678131104,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9439389705657959,
      "rewards/probe_shaping_dominance/std": 0.15084654092788696,
      "rewards/probe_terminal_raw/mean": 0.0625,
      "rewards/probe_terminal_raw/std": 0.16800537705421448,
      "rewards/rollout_reward_func/mean": -0.5324676036834717,
      "rewards/rollout_reward_func/std": 0.24024422466754913,
      "sampling/importance_sampling_ratio/max": 1.3134887218475342,
      "sampling/importance_sampling_ratio/mean": 0.9676171541213989,
      "sampling/importance_sampling_ratio/min": 0.41273218393325806,
      "sampling/sampling_logp_difference/max": 0.8849565982818604,
      "sampling/sampling_logp_difference/mean": 0.026659058406949043,
      "step": 5,
      "step_time": 26.660728665999955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.06810211515403353,
      "epoch": 0.00012,
      "grad_norm": 0.007714552339166403,
      "kl": 0.0028154569756466685,
      "learning_rate": 1.1428571428571428e-06,
      "loss": 0.0,
      "step": 6,
      "step_time": 11.44768754599977
    },
    {
      "clip_ratio/high_max": 0.06250000186264515,
      "clip_ratio/high_mean": 0.031250000931322575,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04115386162811774,
      "epoch": 0.00014,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.003323962911963463,
      "kl": 0.001360555283525855,
      "learning_rate": 1.3714285714285715e-06,
      "loss": 0.0,
      "num_tokens": 410424.0,
      "reward": 2.2917943000793457,
      "reward_std": 0.44559940695762634,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9733562469482422,
      "rewards/probe_shaping_dominance/std": 0.10957542806863785,
      "rewards/probe_terminal_raw/mean": 0.027566056698560715,
      "rewards/probe_terminal_raw/std": 0.10949952900409698,
      "rewards/rollout_reward_func/mean": -0.5341278314590454,
      "rewards/rollout_reward_func/std": 0.27136242389678955,
      "sampling/importance_sampling_ratio/max": 1.0618572235107422,
      "sampling/importance_sampling_ratio/mean": 0.9585317969322205,
      "sampling/importance_sampling_ratio/min": 0.2324376255273819,
      "sampling/sampling_logp_difference/max": 1.470571756362915,
      "sampling/sampling_logp_difference/mean": 0.02589060366153717,
      "step": 7,
      "step_time": 27.5007078120002
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.04836703218097682,
      "epoch": 0.00016,
      "grad_norm": 0.005415800027549267,
      "kl": 0.001694043724171479,
      "learning_rate": 1.6e-06,
      "loss": 0.0,
      "step": 8,
      "step_time": 12.223170772000117
    },
    {
      "clip_ratio/high_max": 0.06250000186264515,
      "clip_ratio/high_mean": 0.031250000931322575,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07847535189284827,
      "epoch": 0.00018,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.010716816410422325,
      "kl": 0.004078912243130617,
      "learning_rate": 1.8285714285714284e-06,
      "loss": -0.0,
      "num_tokens": 511562.0,
      "reward": 2.3355042934417725,
      "reward_std": 0.43706634640693665,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9753304123878479,
      "rewards/probe_shaping_dominance/std": 0.0984005331993103,
      "rewards/probe_terminal_raw/mean": 0.026295732706785202,
      "rewards/probe_terminal_raw/std": 0.10541322082281113,
      "rewards/rollout_reward_func/mean": -0.553621768951416,
      "rewards/rollout_reward_func/std": 0.20992274582386017,
      "sampling/importance_sampling_ratio/max": 2.0806119441986084,
      "sampling/importance_sampling_ratio/mean": 1.0222396850585938,
      "sampling/importance_sampling_ratio/min": 0.5085986256599426,
      "sampling/sampling_logp_difference/max": 0.7373225688934326,
      "sampling/sampling_logp_difference/mean": 0.028744252398610115,
      "step": 9,
      "step_time": 26.567318749999913
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "entropy": 0.07049791459576227,
      "epoch": 0.0002,
      "grad_norm": 0.004469083622097969,
      "kl": 0.026501665124972873,
      "learning_rate": 2.057142857142857e-06,
      "loss": -0.0,
      "step": 10,
      "step_time": 11.64468052799998
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05360435344118741,
      "epoch": 0.00022,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.008809314109385014,
      "kl": 0.004907062985087585,
      "learning_rate": 2.2857142857142856e-06,
      "loss": -0.0,
      "num_tokens": 616201.0,
      "reward": 2.4397201538085938,
      "reward_std": 0.5087255239486694,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 0.9706827998161316,
      "rewards/probe_shaping_dominance/std": 0.12301044911146164,
      "rewards/probe_terminal_raw/mean": 0.025406504049897194,
      "rewards/probe_terminal_raw/std": 0.10275532305240631,
      "rewards/rollout_reward_func/mean": -0.5063689351081848,
      "rewards/rollout_reward_func/std": 0.27631497383117676,
      "sampling/importance_sampling_ratio/max": 1.1329089403152466,
      "sampling/importance_sampling_ratio/mean": 0.9933090806007385,
      "sampling/importance_sampling_ratio/min": 0.768523633480072,
      "sampling/sampling_logp_difference/max": 0.2632848620414734,
      "sampling/sampling_logp_difference/mean": 0.007937189191579819,
      "step": 11,
      "step_time": 27.777191968999887
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.056653952024134924,
      "epoch": 0.00024,
      "grad_norm": 0.005528156645596027,
      "kl": 0.0032436020156101364,
      "learning_rate": 2.5142857142857142e-06,
      "loss": -0.0,
      "step": 12,
      "step_time": 11.55833436900025
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.038703071273630485,
      "epoch": 0.00026,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.007530162110924721,
      "kl": 0.09287417630221206,
      "learning_rate": 2.742857142857143e-06,
      "loss": -0.0,
      "num_tokens": 724364.0,
      "reward": 2.351245880126953,
      "reward_std": 0.4424680173397064,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9703531265258789,
      "rewards/probe_shaping_dominance/std": 0.11669508367776871,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.47535717487335205,
      "rewards/rollout_reward_func/std": 0.24601998925209045,
      "sampling/importance_sampling_ratio/max": 1.440869688987732,
      "sampling/importance_sampling_ratio/mean": 1.0093717575073242,
      "sampling/importance_sampling_ratio/min": 0.7920892238616943,
      "sampling/sampling_logp_difference/max": 0.3652459681034088,
      "sampling/sampling_logp_difference/mean": 0.008522224612534046,
      "step": 13,
      "step_time": 27.311093626999764
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.0453864433511626,
      "epoch": 0.00028,
      "grad_norm": 0.006435270421206951,
      "kl": 0.010504724175871893,
      "learning_rate": 2.9714285714285716e-06,
      "loss": -0.0,
      "step": 14,
      "step_time": 11.88281524700028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.037373697148723295,
      "epoch": 0.0003,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.006931572686880827,
      "kl": 0.0013499163329698805,
      "learning_rate": 3.2e-06,
      "loss": -0.0,
      "num_tokens": 828160.0,
      "reward": 2.3457703590393066,
      "reward_std": 0.32655069231987,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9890751838684082,
      "rewards/probe_shaping_dominance/std": 0.06179998442530632,
      "rewards/probe_terminal_raw/mean": 0.01092479657381773,
      "rewards/probe_terminal_raw/std": 0.06179998070001602,
      "rewards/rollout_reward_func/mean": -0.5417294502258301,
      "rewards/rollout_reward_func/std": 0.19428227841854095,
      "sampling/importance_sampling_ratio/max": 1.5512616634368896,
      "sampling/importance_sampling_ratio/mean": 1.0071200132369995,
      "sampling/importance_sampling_ratio/min": 0.7788013219833374,
      "sampling/sampling_logp_difference/max": 0.43915224075317383,
      "sampling/sampling_logp_difference/mean": 0.008885648101568222,
      "step": 15,
      "step_time": 27.3166221219999
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.03605123230954632,
      "epoch": 0.00032,
      "grad_norm": 0.007162998430430889,
      "kl": 0.0005329122045578671,
      "learning_rate": 3.428571428571428e-06,
      "loss": -0.0,
      "step": 16,
      "step_time": 12.10146868100037
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04818721191259101,
      "epoch": 0.00034,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0020011591259390116,
      "kl": 0.000880227197208705,
      "learning_rate": 3.657142857142857e-06,
      "loss": 0.0,
      "num_tokens": 933852.0,
      "reward": 2.2396738529205322,
      "reward_std": 0.3769412934780121,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9811877012252808,
      "rewards/probe_shaping_dominance/std": 0.07417813688516617,
      "rewards/probe_terminal_raw/mean": 0.019435975700616837,
      "rewards/probe_terminal_raw/std": 0.07648143172264099,
      "rewards/rollout_reward_func/mean": -0.554699718952179,
      "rewards/rollout_reward_func/std": 0.14253978431224823,
      "sampling/importance_sampling_ratio/max": 1.3911144733428955,
      "sampling/importance_sampling_ratio/mean": 1.0014019012451172,
      "sampling/importance_sampling_ratio/min": 0.647373378276825,
      "sampling/sampling_logp_difference/max": 0.4348297119140625,
      "sampling/sampling_logp_difference/mean": 0.01693439856171608,
      "step": 17,
      "step_time": 27.466287271999818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.059825868549523875,
      "epoch": 0.00036,
      "grad_norm": 0.00400462094694376,
      "kl": 0.0010442571770683529,
      "learning_rate": 3.885714285714286e-06,
      "loss": 0.0,
      "step": 18,
      "step_time": 11.729434232999665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07400128486915492,
      "epoch": 0.00038,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004407655913382769,
      "kl": 0.0058712156430829054,
      "learning_rate": 4.114285714285714e-06,
      "loss": -0.0,
      "num_tokens": 1040669.0,
      "reward": 2.3979897499084473,
      "reward_std": 0.3378089666366577,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9721384644508362,
      "rewards/probe_shaping_dominance/std": 0.10964522510766983,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.4928986430168152,
      "rewards/rollout_reward_func/std": 0.280559241771698,
      "sampling/importance_sampling_ratio/max": 1.2489417791366577,
      "sampling/importance_sampling_ratio/mean": 0.9779143333435059,
      "sampling/importance_sampling_ratio/min": 0.5380392670631409,
      "sampling/sampling_logp_difference/max": 0.619827151298523,
      "sampling/sampling_logp_difference/mean": 0.017949596047401428,
      "step": 19,
      "step_time": 28.172511434999933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.07153313838352915,
      "epoch": 0.0004,
      "grad_norm": 0.010058136656880379,
      "kl": 0.01704683385832595,
      "learning_rate": 4.342857142857142e-06,
      "loss": -0.0,
      "step": 20,
      "step_time": 11.798744088000149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07958520320244133,
      "epoch": 0.00042,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.015031801536679268,
      "kl": 0.02134023218428638,
      "learning_rate": 4.571428571428571e-06,
      "loss": 0.0,
      "num_tokens": 1146440.0,
      "reward": 2.2259719371795654,
      "reward_std": 0.4264923334121704,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9849475622177124,
      "rewards/probe_shaping_dominance/std": 0.08514932543039322,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.505850613117218,
      "rewards/rollout_reward_func/std": 0.22946372628211975,
      "sampling/importance_sampling_ratio/max": 1.8730424642562866,
      "sampling/importance_sampling_ratio/mean": 1.0450382232666016,
      "sampling/importance_sampling_ratio/min": 0.6261028051376343,
      "sampling/sampling_logp_difference/max": 0.6275629997253418,
      "sampling/sampling_logp_difference/mean": 0.033233314752578735,
      "step": 21,
      "step_time": 27.33938806800029
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.08325157128274441,
      "epoch": 0.00044,
      "grad_norm": 0.01334489043802023,
      "kl": 0.01684667149083907,
      "learning_rate": 4.8e-06,
      "loss": 0.0,
      "step": 22,
      "step_time": 11.840854454999999
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06853798200609162,
      "epoch": 0.00046,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.01755128987133503,
      "kl": 0.003467819899402258,
      "learning_rate": 5.0285714285714285e-06,
      "loss": 0.0001,
      "num_tokens": 1248638.0,
      "reward": 2.270667552947998,
      "reward_std": 0.47502174973487854,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.978266716003418,
      "rewards/probe_shaping_dominance/std": 0.08902076631784439,
      "rewards/probe_terminal_raw/mean": 0.025406504049897194,
      "rewards/probe_terminal_raw/std": 0.10275533050298691,
      "rewards/rollout_reward_func/mean": -0.495505690574646,
      "rewards/rollout_reward_func/std": 0.24283160269260406,
      "sampling/importance_sampling_ratio/max": 2.039003610610962,
      "sampling/importance_sampling_ratio/mean": 1.0263185501098633,
      "sampling/importance_sampling_ratio/min": 0.6725395321846008,
      "sampling/sampling_logp_difference/max": 0.8136651515960693,
      "sampling/sampling_logp_difference/mean": 0.02945869043469429,
      "step": 23,
      "step_time": 27.97098964299971
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.0757814844546374,
      "epoch": 0.00048,
      "grad_norm": 0.02817094884812832,
      "kl": 0.009625433100154623,
      "learning_rate": 5.257142857142857e-06,
      "loss": 0.0001,
      "step": 24,
      "step_time": 11.866423993000353
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.046443949002423324,
      "epoch": 0.0005,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.002482979791238904,
      "kl": 0.011937914369631542,
      "learning_rate": 5.485714285714286e-06,
      "loss": -0.0,
      "num_tokens": 1348967.0,
      "reward": 2.4115562438964844,
      "reward_std": 0.4029836654663086,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.987568199634552,
      "rewards/probe_shaping_dominance/std": 0.07032480090856552,
      "rewards/probe_terminal_raw/mean": 0.011941056698560715,
      "rewards/probe_terminal_raw/std": 0.06754881888628006,
      "rewards/rollout_reward_func/mean": -0.4754529595375061,
      "rewards/rollout_reward_func/std": 0.20119507610797882,
      "sampling/importance_sampling_ratio/max": 1.2200837135314941,
      "sampling/importance_sampling_ratio/mean": 0.9975783824920654,
      "sampling/importance_sampling_ratio/min": 0.8279879689216614,
      "sampling/sampling_logp_difference/max": 0.1989191770553589,
      "sampling/sampling_logp_difference/mean": 0.011062754318118095,
      "step": 25,
      "step_time": 26.57660025700011
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.048878253597649746,
      "epoch": 0.00052,
      "grad_norm": 0.009242719039320946,
      "kl": 0.008345632606265863,
      "learning_rate": 5.7142857142857145e-06,
      "loss": -0.0,
      "step": 26,
      "step_time": 11.446816336000438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.03407000357401557,
      "epoch": 0.00054,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0032159958500415087,
      "kl": 0.0009551170151098631,
      "learning_rate": 5.942857142857143e-06,
      "loss": 0.0001,
      "num_tokens": 1454840.0,
      "reward": 2.308957099914551,
      "reward_std": 0.35809147357940674,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9919047951698303,
      "rewards/probe_shaping_dominance/std": 0.04579342529177666,
      "rewards/probe_terminal_raw/mean": 0.00825711339712143,
      "rewards/probe_terminal_raw/std": 0.04670928418636322,
      "rewards/rollout_reward_func/mean": -0.4849545955657959,
      "rewards/rollout_reward_func/std": 0.17723596096038818,
      "sampling/importance_sampling_ratio/max": 1.3277825117111206,
      "sampling/importance_sampling_ratio/mean": 1.03197181224823,
      "sampling/importance_sampling_ratio/min": 0.9784432053565979,
      "sampling/sampling_logp_difference/max": 0.2835111618041992,
      "sampling/sampling_logp_difference/mean": 0.010589659214019775,
      "step": 27,
      "step_time": 27.828797529999974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.031250000931322575,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "entropy": 0.03843506844714284,
      "epoch": 0.00056,
      "grad_norm": 0.001164909452199936,
      "kl": 0.0005121690442896343,
      "learning_rate": 6.171428571428571e-06,
      "loss": 0.0001,
      "step": 28,
      "step_time": 11.809285704000104
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04930314904777333,
      "epoch": 0.00058,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.002748744795098901,
      "kl": 0.004907883932952495,
      "learning_rate": 6.4e-06,
      "loss": -0.0,
      "num_tokens": 1556979.0,
      "reward": 2.240399122238159,
      "reward_std": 0.4602973461151123,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9860905408859253,
      "rewards/probe_shaping_dominance/std": 0.0786839947104454,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.5863164663314819,
      "rewards/rollout_reward_func/std": 0.2140309065580368,
      "sampling/importance_sampling_ratio/max": 1.2453359365463257,
      "sampling/importance_sampling_ratio/mean": 0.9654719233512878,
      "sampling/importance_sampling_ratio/min": 0.4166664183139801,
      "sampling/sampling_logp_difference/max": 0.8754727840423584,
      "sampling/sampling_logp_difference/mean": 0.023819994181394577,
      "step": 29,
      "step_time": 26.907328863000203
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "entropy": 0.04787830199347809,
      "epoch": 0.0006,
      "grad_norm": 0.004575630649924278,
      "kl": 0.021033072499267114,
      "learning_rate": 6.628571428571428e-06,
      "loss": -0.0,
      "step": 30,
      "step_time": 12.03838489500049
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06083334801951423,
      "epoch": 0.00062,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.013260968960821629,
      "kl": 0.0185297402889546,
      "learning_rate": 6.857142857142856e-06,
      "loss": 0.0001,
      "num_tokens": 1662740.0,
      "reward": 2.1973555088043213,
      "reward_std": 0.43850135803222656,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.991719663143158,
      "rewards/probe_shaping_dominance/std": 0.046840641647577286,
      "rewards/probe_terminal_raw/mean": 0.008384146727621555,
      "rewards/probe_terminal_raw/std": 0.04742789641022682,
      "rewards/rollout_reward_func/mean": -0.5964983701705933,
      "rewards/rollout_reward_func/std": 0.296856164932251,
      "sampling/importance_sampling_ratio/max": 2.8883938789367676,
      "sampling/importance_sampling_ratio/mean": 1.041499376296997,
      "sampling/importance_sampling_ratio/min": 0.611585795879364,
      "sampling/sampling_logp_difference/max": 0.9767682552337646,
      "sampling/sampling_logp_difference/mean": 0.02332986891269684,
      "step": 31,
      "step_time": 27.415389096000126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.03750000149011612,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.03750000149011612,
      "entropy": 0.06045454426202923,
      "epoch": 0.00064,
      "grad_norm": 0.014426084235310555,
      "kl": 0.027800074360129656,
      "learning_rate": 7.085714285714285e-06,
      "loss": 0.0001,
      "step": 32,
      "step_time": 11.844893076999824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0422610079695005,
      "epoch": 0.00066,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005128172226250172,
      "kl": 0.009348716392499568,
      "learning_rate": 7.314285714285714e-06,
      "loss": 0.0,
      "num_tokens": 1765521.0,
      "reward": 2.3525331020355225,
      "reward_std": 0.3403870165348053,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9837720394134521,
      "rewards/probe_shaping_dominance/std": 0.06571495532989502,
      "rewards/probe_terminal_raw/mean": 0.01880081370472908,
      "rewards/probe_terminal_raw/std": 0.0745616927742958,
      "rewards/rollout_reward_func/mean": -0.5375398397445679,
      "rewards/rollout_reward_func/std": 0.22309184074401855,
      "sampling/importance_sampling_ratio/max": 1.275700569152832,
      "sampling/importance_sampling_ratio/mean": 0.994273841381073,
      "sampling/importance_sampling_ratio/min": 0.600629448890686,
      "sampling/sampling_logp_difference/max": 0.5097755193710327,
      "sampling/sampling_logp_difference/mean": 0.011872323229908943,
      "step": 33,
      "step_time": 27.277823512999475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.0630603444587905,
      "epoch": 0.00068,
      "grad_norm": 0.007451063022017479,
      "kl": 0.007260499390742581,
      "learning_rate": 7.542857142857142e-06,
      "loss": 0.0,
      "step": 34,
      "step_time": 12.15706381699988
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4.0,
      "completions/max_terminated_length": 4.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06165817377041094,
      "epoch": 0.0007,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.01730046235024929,
      "kl": 0.007911830088153327,
      "learning_rate": 7.771428571428572e-06,
      "loss": 0.0,
      "num_tokens": 1868519.0,
      "reward": 2.275172233581543,
      "reward_std": 0.48706814646720886,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.42121174931526184,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9609175324440002,
      "rewards/probe_shaping_dominance/std": 0.12447085976600647,
      "rewards/probe_terminal_raw/mean": 0.042174797505140305,
      "rewards/probe_terminal_raw/std": 0.13503843545913696,
      "rewards/rollout_reward_func/mean": -0.552919864654541,
      "rewards/rollout_reward_func/std": 0.20079734921455383,
      "sampling/importance_sampling_ratio/max": 2.4695143699645996,
      "sampling/importance_sampling_ratio/mean": 1.0170851945877075,
      "sampling/importance_sampling_ratio/min": 0.5358201861381531,
      "sampling/sampling_logp_difference/max": 0.9040230512619019,
      "sampling/sampling_logp_difference/mean": 0.023447973653674126,
      "step": 35,
      "step_time": 26.740296546999843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.058829510177019984,
      "epoch": 0.00072,
      "grad_norm": 0.0026921494863927364,
      "kl": 0.008077224918185522,
      "learning_rate": 8e-06,
      "loss": 0.0,
      "step": 36,
      "step_time": 11.526741372999822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0314667156167161,
      "epoch": 0.00074,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0024028760381042957,
      "kl": 0.00625098004627489,
      "learning_rate": 7.999999998518522e-06,
      "loss": -0.0,
      "num_tokens": 1970124.0,
      "reward": 2.264838933944702,
      "reward_std": 0.5270799994468689,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9424034953117371,
      "rewards/probe_shaping_dominance/std": 0.13680323958396912,
      "rewards/probe_terminal_raw/mean": 0.05856199190020561,
      "rewards/probe_terminal_raw/std": 0.1405627578496933,
      "rewards/rollout_reward_func/mean": -0.4673765003681183,
      "rewards/rollout_reward_func/std": 0.2097388207912445,
      "sampling/importance_sampling_ratio/max": 1.8680520057678223,
      "sampling/importance_sampling_ratio/mean": 1.0426936149597168,
      "sampling/importance_sampling_ratio/min": 0.9883837103843689,
      "sampling/sampling_logp_difference/max": 0.6248946189880371,
      "sampling/sampling_logp_difference/mean": 0.012692131102085114,
      "step": 37,
      "step_time": 26.3523716899997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.0313223133775864,
      "epoch": 0.00076,
      "grad_norm": 0.0023324843496084213,
      "kl": 0.0035868614445746516,
      "learning_rate": 7.99999999407409e-06,
      "loss": -0.0,
      "step": 38,
      "step_time": 12.628685679
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05816701124422252,
      "epoch": 0.00078,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.007494654040783644,
      "kl": 0.03421914212867705,
      "learning_rate": 7.999999986666703e-06,
      "loss": -0.0,
      "num_tokens": 2076598.0,
      "reward": 2.311230182647705,
      "reward_std": 0.36618658900260925,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9932495951652527,
      "rewards/probe_shaping_dominance/std": 0.03818599134683609,
      "rewards/probe_terminal_raw/mean": 0.00889227632433176,
      "rewards/probe_terminal_raw/std": 0.05030231550335884,
      "rewards/rollout_reward_func/mean": -0.6096617579460144,
      "rewards/rollout_reward_func/std": 0.20722205936908722,
      "sampling/importance_sampling_ratio/max": 1.4155004024505615,
      "sampling/importance_sampling_ratio/mean": 0.9876462817192078,
      "sampling/importance_sampling_ratio/min": 0.7839126586914062,
      "sampling/sampling_logp_difference/max": 0.3471514582633972,
      "sampling/sampling_logp_difference/mean": 0.0168665312230587,
      "step": 39,
      "step_time": 26.542540336999764
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.057803097704891115,
      "epoch": 0.0008,
      "grad_norm": 0.004047502297908068,
      "kl": 0.02604524488651805,
      "learning_rate": 7.99999997629636e-06,
      "loss": -0.0,
      "step": 40,
      "step_time": 11.67055183600064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.030615816707722843,
      "epoch": 0.00082,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.002531230915337801,
      "kl": 0.0002023791248291218,
      "learning_rate": 7.999999962963062e-06,
      "loss": 0.0,
      "num_tokens": 2182025.0,
      "reward": 2.3659095764160156,
      "reward_std": 0.3363305926322937,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.970511794090271,
      "rewards/probe_shaping_dominance/std": 0.11608950048685074,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.49210208654403687,
      "rewards/rollout_reward_func/std": 0.19491301476955414,
      "sampling/importance_sampling_ratio/max": 1.0795150995254517,
      "sampling/importance_sampling_ratio/mean": 1.0009956359863281,
      "sampling/importance_sampling_ratio/min": 0.9117990136146545,
      "sampling/sampling_logp_difference/max": 0.09234827756881714,
      "sampling/sampling_logp_difference/mean": 0.004786844830960035,
      "step": 41,
      "step_time": 26.691086626000242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.029701224237214774,
      "epoch": 0.00084,
      "grad_norm": 0.0024189443793147802,
      "kl": 0.0002964178702313802,
      "learning_rate": 7.999999946666809e-06,
      "loss": 0.0,
      "step": 42,
      "step_time": 12.699684607000108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4.0,
      "completions/max_terminated_length": 4.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05056236406426251,
      "epoch": 0.00086,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.01209098007529974,
      "kl": 0.010471812368450628,
      "learning_rate": 7.999999927407602e-06,
      "loss": -0.0,
      "num_tokens": 2286142.0,
      "reward": 2.469311237335205,
      "reward_std": 0.4115804135799408,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.2540002465248108,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9852168560028076,
      "rewards/probe_shaping_dominance/std": 0.0836259201169014,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.48153066635131836,
      "rewards/rollout_reward_func/std": 0.24715669453144073,
      "sampling/importance_sampling_ratio/max": 2.0913164615631104,
      "sampling/importance_sampling_ratio/mean": 1.0417256355285645,
      "sampling/importance_sampling_ratio/min": 0.8711547255516052,
      "sampling/sampling_logp_difference/max": 0.7377924919128418,
      "sampling/sampling_logp_difference/mean": 0.016645925119519234,
      "step": 43,
      "step_time": 26.97034319699992
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.06137717463207082,
      "epoch": 0.00088,
      "grad_norm": 0.004214904736727476,
      "kl": 0.02022934940032428,
      "learning_rate": 7.99999990518544e-06,
      "loss": -0.0,
      "step": 44,
      "step_time": 11.70073124500027
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.015625,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.028125000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0973230431554839,
      "epoch": 0.0009,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.008132712915539742,
      "kl": 0.012426901788174405,
      "learning_rate": 7.999999880000322e-06,
      "loss": 0.0,
      "num_tokens": 2390804.0,
      "reward": 2.2431583404541016,
      "reward_std": 0.5248546600341797,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9669345021247864,
      "rewards/probe_shaping_dominance/std": 0.10819793492555618,
      "rewards/probe_terminal_raw/mean": 0.038998983800411224,
      "rewards/probe_terminal_raw/std": 0.1286177635192871,
      "rewards/rollout_reward_func/mean": -0.4940252900123596,
      "rewards/rollout_reward_func/std": 0.255024790763855,
      "sampling/importance_sampling_ratio/max": 1.6163866519927979,
      "sampling/importance_sampling_ratio/mean": 0.9977768659591675,
      "sampling/importance_sampling_ratio/min": 0.3879617154598236,
      "sampling/sampling_logp_difference/max": 0.9467527270317078,
      "sampling/sampling_logp_difference/mean": 0.02932477556169033,
      "step": 45,
      "step_time": 26.472695325999894
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.03125,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.043750000186264515,
      "entropy": 0.09132259455509484,
      "epoch": 0.00092,
      "grad_norm": 0.004103749990463257,
      "kl": 0.02156046110090415,
      "learning_rate": 7.99999985185225e-06,
      "loss": 0.0,
      "step": 46,
      "step_time": 12.17020241299997
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08302483463194221,
      "epoch": 0.00094,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.00962382648140192,
      "kl": 0.05296483388110573,
      "learning_rate": 7.999999820741223e-06,
      "loss": 0.0,
      "num_tokens": 2498950.0,
      "reward": 2.3484296798706055,
      "reward_std": 0.40232396125793457,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9905115365982056,
      "rewards/probe_shaping_dominance/std": 0.05367483198642731,
      "rewards/probe_terminal_raw/mean": 0.009019308723509312,
      "rewards/probe_terminal_raw/std": 0.05102091282606125,
      "rewards/rollout_reward_func/mean": -0.507351279258728,
      "rewards/rollout_reward_func/std": 0.22662682831287384,
      "sampling/importance_sampling_ratio/max": 1.3692384958267212,
      "sampling/importance_sampling_ratio/mean": 0.9901071786880493,
      "sampling/importance_sampling_ratio/min": 0.3076327443122864,
      "sampling/sampling_logp_difference/max": 1.179471731185913,
      "sampling/sampling_logp_difference/mean": 0.03242562711238861,
      "step": 47,
      "step_time": 26.895124169999463
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.07248554454417899,
      "epoch": 0.00096,
      "grad_norm": 0.01555224135518074,
      "kl": 0.039988372170228104,
      "learning_rate": 7.99999978666724e-06,
      "loss": -0.0,
      "step": 48,
      "step_time": 11.803917615999808
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06659889499132987,
      "epoch": 0.00098,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.007047319784760475,
      "kl": 0.038143942947499454,
      "learning_rate": 7.999999749630303e-06,
      "loss": 0.0001,
      "num_tokens": 2605752.0,
      "reward": 2.304872512817383,
      "reward_std": 0.4004109501838684,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5201276540756226,
      "rewards/rollout_reward_func/std": 0.2584696114063263,
      "sampling/importance_sampling_ratio/max": 2.615042209625244,
      "sampling/importance_sampling_ratio/mean": 1.0269113779067993,
      "sampling/importance_sampling_ratio/min": 0.39808669686317444,
      "sampling/sampling_logp_difference/max": 0.9612793922424316,
      "sampling/sampling_logp_difference/mean": 0.03832431882619858,
      "step": 49,
      "step_time": 26.91781551100007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.0618492451030761,
      "epoch": 0.001,
      "grad_norm": 0.00791104231029749,
      "kl": 0.05557279207035515,
      "learning_rate": 7.999999709630412e-06,
      "loss": 0.0001,
      "step": 50,
      "step_time": 12.788009578999208
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05631835470558144,
      "epoch": 0.00102,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0038132003974169493,
      "kl": 0.029594353904632498,
      "learning_rate": 7.999999666667564e-06,
      "loss": 0.0,
      "num_tokens": 2707257.0,
      "reward": 2.346804618835449,
      "reward_std": 0.2936249077320099,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.571945309638977,
      "rewards/rollout_reward_func/std": 0.23333650827407837,
      "sampling/importance_sampling_ratio/max": 1.6730494499206543,
      "sampling/importance_sampling_ratio/mean": 0.9981693029403687,
      "sampling/importance_sampling_ratio/min": 0.40917959809303284,
      "sampling/sampling_logp_difference/max": 0.9063196182250977,
      "sampling/sampling_logp_difference/mean": 0.024803204461932182,
      "step": 51,
      "step_time": 26.73268852599972
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.05670045691658743,
      "epoch": 0.00104,
      "grad_norm": 0.003768681548535824,
      "kl": 0.030258090482694455,
      "learning_rate": 7.999999620741765e-06,
      "loss": 0.0,
      "step": 52,
      "step_time": 11.579914525999584
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.1101932916790247,
      "epoch": 0.00106,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0030547163914889097,
      "kl": 0.01951221001081649,
      "learning_rate": 7.999999571853009e-06,
      "loss": 0.0,
      "num_tokens": 2811393.0,
      "reward": 2.1927480697631836,
      "reward_std": 0.406143456697464,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9884125590324402,
      "rewards/probe_shaping_dominance/std": 0.0655483826994896,
      "rewards/probe_terminal_raw/mean": 0.01143292710185051,
      "rewards/probe_terminal_raw/std": 0.06467439979314804,
      "rewards/rollout_reward_func/mean": -0.5695973038673401,
      "rewards/rollout_reward_func/std": 0.16589799523353577,
      "sampling/importance_sampling_ratio/max": 1.0527032613754272,
      "sampling/importance_sampling_ratio/mean": 0.9693626165390015,
      "sampling/importance_sampling_ratio/min": 0.5484977960586548,
      "sampling/sampling_logp_difference/max": 0.6245040893554688,
      "sampling/sampling_logp_difference/mean": 0.023702893406152725,
      "step": 53,
      "step_time": 27.233809398999938
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.11004186974605545,
      "epoch": 0.00108,
      "grad_norm": 0.006082055624574423,
      "kl": 0.04293493747854882,
      "learning_rate": 7.999999520001299e-06,
      "loss": 0.0,
      "step": 54,
      "step_time": 12.14583877500013
    },
    {
      "clip_ratio/high_max": 0.05208333395421505,
      "clip_ratio/high_mean": 0.026041666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.026041666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0866635709971888,
      "epoch": 0.0011,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005786150228232145,
      "kl": 0.045153988463084715,
      "learning_rate": 7.999999465186634e-06,
      "loss": 0.0,
      "num_tokens": 2914367.0,
      "reward": 2.3385372161865234,
      "reward_std": 0.3273521363735199,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5177128314971924,
      "rewards/rollout_reward_func/std": 0.2579730451107025,
      "sampling/importance_sampling_ratio/max": 1.2267568111419678,
      "sampling/importance_sampling_ratio/mean": 0.9484584331512451,
      "sampling/importance_sampling_ratio/min": 0.5135900378227234,
      "sampling/sampling_logp_difference/max": 0.6663306355476379,
      "sampling/sampling_logp_difference/mean": 0.0320717915892601,
      "step": 55,
      "step_time": 26.36462075400027
    },
    {
      "clip_ratio/high_max": 0.0729166679084301,
      "clip_ratio/high_mean": 0.046875000931322575,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.046875000931322575,
      "entropy": 0.09378209740680177,
      "epoch": 0.00112,
      "grad_norm": 0.007270520552992821,
      "kl": 0.05788560025212064,
      "learning_rate": 7.999999407409014e-06,
      "loss": 0.0,
      "step": 56,
      "step_time": 11.583988187999921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08144025912042707,
      "epoch": 0.00114,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.006977744400501251,
      "kl": 0.16513798182256778,
      "learning_rate": 7.99999934666844e-06,
      "loss": -0.0,
      "num_tokens": 3018848.0,
      "reward": 2.2243924140930176,
      "reward_std": 0.4345919191837311,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9774075150489807,
      "rewards/probe_shaping_dominance/std": 0.08980447798967361,
      "rewards/probe_terminal_raw/mean": 0.02489837259054184,
      "rewards/probe_terminal_raw/std": 0.1013173907995224,
      "rewards/rollout_reward_func/mean": -0.540413498878479,
      "rewards/rollout_reward_func/std": 0.20110559463500977,
      "sampling/importance_sampling_ratio/max": 2.1173288822174072,
      "sampling/importance_sampling_ratio/mean": 1.0253949165344238,
      "sampling/importance_sampling_ratio/min": 0.34861743450164795,
      "sampling/sampling_logp_difference/max": 1.0653817653656006,
      "sampling/sampling_logp_difference/mean": 0.03663061559200287,
      "step": 57,
      "step_time": 27.63207101699959
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.08666177466511726,
      "epoch": 0.00116,
      "grad_norm": 0.00648898771032691,
      "kl": 0.14551325980573893,
      "learning_rate": 7.999999282964912e-06,
      "loss": 0.0,
      "step": 58,
      "step_time": 12.149218646000236
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0776638601673767,
      "epoch": 0.00118,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.006341388914734125,
      "kl": 0.1438233179026156,
      "learning_rate": 7.999999216298429e-06,
      "loss": 0.0,
      "num_tokens": 3118313.0,
      "reward": 2.337385654449463,
      "reward_std": 0.40537285804748535,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9689397215843201,
      "rewards/probe_shaping_dominance/std": 0.09820227324962616,
      "rewards/probe_terminal_raw/mean": 0.03201219439506531,
      "rewards/probe_terminal_raw/std": 0.10123317688703537,
      "rewards/rollout_reward_func/mean": -0.5198163986206055,
      "rewards/rollout_reward_func/std": 0.24933888018131256,
      "sampling/importance_sampling_ratio/max": 1.642152190208435,
      "sampling/importance_sampling_ratio/mean": 0.9745345115661621,
      "sampling/importance_sampling_ratio/min": 0.32652705907821655,
      "sampling/sampling_logp_difference/max": 1.1220024824142456,
      "sampling/sampling_logp_difference/mean": 0.04093600809574127,
      "step": 59,
      "step_time": 26.148361385999806
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.08309375832322985,
      "epoch": 0.0012,
      "grad_norm": 0.009624861180782318,
      "kl": 0.15202067893005733,
      "learning_rate": 7.999999146668991e-06,
      "loss": 0.0,
      "step": 60,
      "step_time": 11.512923075000117
    },
    {
      "clip_ratio/high_max": 0.07083333469927311,
      "clip_ratio/high_mean": 0.035416667349636555,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.056250001303851604,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.10362166631966829,
      "epoch": 0.00122,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.012691067531704903,
      "kl": 0.22026659833500162,
      "learning_rate": 7.999999074076601e-06,
      "loss": 0.0001,
      "num_tokens": 3227556.0,
      "reward": 2.3282229900360107,
      "reward_std": 0.4200522303581238,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.972678005695343,
      "rewards/probe_shaping_dominance/std": 0.10808944702148438,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.4694550037384033,
      "rewards/rollout_reward_func/std": 0.2165255844593048,
      "sampling/importance_sampling_ratio/max": 1.6590189933776855,
      "sampling/importance_sampling_ratio/mean": 0.9916884899139404,
      "sampling/importance_sampling_ratio/min": 0.47236600518226624,
      "sampling/sampling_logp_difference/max": 0.7500003576278687,
      "sampling/sampling_logp_difference/mean": 0.045740097761154175,
      "step": 61,
      "step_time": 28.188819883999713
    },
    {
      "clip_ratio/high_max": 0.07083333469927311,
      "clip_ratio/high_mean": 0.035416667349636555,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.04583333432674408,
      "entropy": 0.10312735941261053,
      "epoch": 0.00124,
      "grad_norm": 0.019286708906292915,
      "kl": 0.11081840936094522,
      "learning_rate": 7.999998998521257e-06,
      "loss": 0.0001,
      "step": 62,
      "step_time": 11.837706676999915
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08664211053110193,
      "epoch": 0.00126,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.015839533880352974,
      "kl": 0.10350155318124621,
      "learning_rate": 7.999998920002956e-06,
      "loss": -0.0,
      "num_tokens": 3332394.0,
      "reward": 2.405167579650879,
      "reward_std": 0.46130281686782837,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9712571501731873,
      "rewards/probe_shaping_dominance/std": 0.11318810284137726,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.48483964800834656,
      "rewards/rollout_reward_func/std": 0.24800339341163635,
      "sampling/importance_sampling_ratio/max": 1.9499810934066772,
      "sampling/importance_sampling_ratio/mean": 0.9958123564720154,
      "sampling/importance_sampling_ratio/min": 0.30673947930336,
      "sampling/sampling_logp_difference/max": 0.8753989338874817,
      "sampling/sampling_logp_difference/mean": 0.03312094882130623,
      "step": 63,
      "step_time": 26.684526995999477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.08623273627017625,
      "epoch": 0.00128,
      "grad_norm": 0.022980431094765663,
      "kl": 0.11929617358450173,
      "learning_rate": 7.999998838521705e-06,
      "loss": -0.0,
      "step": 64,
      "step_time": 12.258063536000009
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07359768182504922,
      "epoch": 0.0013,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.011483771726489067,
      "kl": 0.10457528214246281,
      "learning_rate": 7.999998754077496e-06,
      "loss": -0.0,
      "num_tokens": 3436726.0,
      "reward": 2.377361297607422,
      "reward_std": 0.5483381748199463,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.41638875007629395,
      "rewards/rollout_reward_func/std": 0.2915210723876953,
      "sampling/importance_sampling_ratio/max": 1.1877729892730713,
      "sampling/importance_sampling_ratio/mean": 0.9874942898750305,
      "sampling/importance_sampling_ratio/min": 0.26991596817970276,
      "sampling/sampling_logp_difference/max": 1.309645414352417,
      "sampling/sampling_logp_difference/mean": 0.027806004509329796,
      "step": 65,
      "step_time": 27.115505474999736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.06954771315213293,
      "epoch": 0.00132,
      "grad_norm": 0.011225158348679543,
      "kl": 0.4594924821127222,
      "learning_rate": 7.999998666670336e-06,
      "loss": -0.0,
      "step": 66,
      "step_time": 11.664916763999372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05904226377606392,
      "epoch": 0.00134,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.012288263067603111,
      "kl": 0.0946728276903741,
      "learning_rate": 7.999998576300222e-06,
      "loss": -0.0,
      "num_tokens": 3541291.0,
      "reward": 2.2826719284057617,
      "reward_std": 0.36464667320251465,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9738304615020752,
      "rewards/probe_shaping_dominance/std": 0.0877876877784729,
      "rewards/probe_terminal_raw/mean": 0.03137703239917755,
      "rewards/probe_terminal_raw/std": 0.10557617992162704,
      "rewards/rollout_reward_func/mean": -0.6100356578826904,
      "rewards/rollout_reward_func/std": 0.23593732714653015,
      "sampling/importance_sampling_ratio/max": 1.271332859992981,
      "sampling/importance_sampling_ratio/mean": 0.9844968914985657,
      "sampling/importance_sampling_ratio/min": 0.3530118763446808,
      "sampling/sampling_logp_difference/max": 1.0369465351104736,
      "sampling/sampling_logp_difference/mean": 0.02148618921637535,
      "step": 67,
      "step_time": 26.421768857000643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.06591829867102206,
      "epoch": 0.00136,
      "grad_norm": 0.01136076170951128,
      "kl": 0.09406092630524654,
      "learning_rate": 7.999998482967154e-06,
      "loss": -0.0,
      "step": 68,
      "step_time": 12.272947167999973
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09870199719443917,
      "epoch": 0.00138,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.017969369888305664,
      "kl": 0.16196376640436938,
      "learning_rate": 7.999998386671134e-06,
      "loss": 0.0,
      "num_tokens": 3645068.0,
      "reward": 2.2971627712249756,
      "reward_std": 0.3776472806930542,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9890751838684082,
      "rewards/probe_shaping_dominance/std": 0.06179998442530632,
      "rewards/probe_terminal_raw/mean": 0.01092479657381773,
      "rewards/probe_terminal_raw/std": 0.06179998070001602,
      "rewards/rollout_reward_func/mean": -0.5590872764587402,
      "rewards/rollout_reward_func/std": 0.19611209630966187,
      "sampling/importance_sampling_ratio/max": 2.4048268795013428,
      "sampling/importance_sampling_ratio/mean": 0.9662601947784424,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.3840640783309937,
      "sampling/sampling_logp_difference/mean": 0.0624161995947361,
      "step": 69,
      "step_time": 26.791781901999457
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.1055372767150402,
      "epoch": 0.0014,
      "grad_norm": 0.006739933043718338,
      "kl": 0.17029937845654786,
      "learning_rate": 7.999998287412158e-06,
      "loss": 0.0,
      "step": 70,
      "step_time": 11.527228552999532
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0416666679084301,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0755673204548657,
      "epoch": 0.00142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0038206640165299177,
      "kl": 0.27058742146891746,
      "learning_rate": 7.99999818519023e-06,
      "loss": -0.0,
      "num_tokens": 3745050.0,
      "reward": 2.4418420791625977,
      "reward_std": 0.3276258409023285,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9443497061729431,
      "rewards/probe_shaping_dominance/std": 0.17115506529808044,
      "rewards/probe_terminal_raw/mean": 0.05487804859876633,
      "rewards/probe_terminal_raw/std": 0.15910547971725464,
      "rewards/rollout_reward_func/mean": -0.4761357307434082,
      "rewards/rollout_reward_func/std": 0.27386248111724854,
      "sampling/importance_sampling_ratio/max": 1.2027363777160645,
      "sampling/importance_sampling_ratio/mean": 0.9526693224906921,
      "sampling/importance_sampling_ratio/min": 0.26859819889068604,
      "sampling/sampling_logp_difference/max": 1.314541220664978,
      "sampling/sampling_logp_difference/mean": 0.04236820340156555,
      "step": 71,
      "step_time": 25.810139078000248
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.07833350286819041,
      "epoch": 0.00144,
      "grad_norm": 0.006155087612569332,
      "kl": 0.15766439647995867,
      "learning_rate": 7.999998080005348e-06,
      "loss": -0.0,
      "step": 72,
      "step_time": 11.807300304999444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4.0,
      "completions/max_terminated_length": 4.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.12527845823206007,
      "epoch": 0.00146,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.012825227342545986,
      "kl": 0.4211071440950036,
      "learning_rate": 7.999997971857512e-06,
      "loss": 0.0001,
      "num_tokens": 3846778.0,
      "reward": 2.290764570236206,
      "reward_std": 0.5837900042533875,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.4908435642719269,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9892492890357971,
      "rewards/probe_shaping_dominance/std": 0.06081530824303627,
      "rewards/probe_terminal_raw/mean": 0.010797764174640179,
      "rewards/probe_terminal_raw/std": 0.06108137592673302,
      "rewards/rollout_reward_func/mean": -0.4405323565006256,
      "rewards/rollout_reward_func/std": 0.3242381811141968,
      "sampling/importance_sampling_ratio/max": 1.6338335275650024,
      "sampling/importance_sampling_ratio/mean": 0.9540376663208008,
      "sampling/importance_sampling_ratio/min": 0.19394879043102264,
      "sampling/sampling_logp_difference/max": 1.26481294631958,
      "sampling/sampling_logp_difference/mean": 0.07170334458351135,
      "step": 73,
      "step_time": 27.727274773000772
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.035416667349636555,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.035416667349636555,
      "entropy": 0.1369485834147781,
      "epoch": 0.00148,
      "grad_norm": 0.006000218912959099,
      "kl": 0.3834730681264773,
      "learning_rate": 7.999997860746726e-06,
      "loss": 0.0,
      "step": 74,
      "step_time": 11.550198297999486
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05670425167772919,
      "epoch": 0.0015,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004246127791702747,
      "kl": 0.26258886672280823,
      "learning_rate": 7.999997746672985e-06,
      "loss": 0.0001,
      "num_tokens": 3952684.0,
      "reward": 2.3076558113098145,
      "reward_std": 0.2708474397659302,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5798441171646118,
      "rewards/rollout_reward_func/std": 0.21061494946479797,
      "sampling/importance_sampling_ratio/max": 1.4762965440750122,
      "sampling/importance_sampling_ratio/mean": 0.9765973091125488,
      "sampling/importance_sampling_ratio/min": 0.1482001394033432,
      "sampling/sampling_logp_difference/max": 1.9091930389404297,
      "sampling/sampling_logp_difference/mean": 0.034642815589904785,
      "step": 75,
      "step_time": 27.424144634000186
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.06237753387540579,
      "epoch": 0.00152,
      "grad_norm": 0.005785573739558458,
      "kl": 0.34405436088127317,
      "learning_rate": 7.999997629636291e-06,
      "loss": 0.0001,
      "step": 76,
      "step_time": 12.303879873000824
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08415639377199113,
      "epoch": 0.00154,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005243807099759579,
      "kl": 0.17415540551155573,
      "learning_rate": 7.999997509636644e-06,
      "loss": 0.0,
      "num_tokens": 4058589.0,
      "reward": 2.46805739402771,
      "reward_std": 0.32934877276420593,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9650155901908875,
      "rewards/probe_shaping_dominance/std": 0.11404264718294144,
      "rewards/probe_terminal_raw/mean": 0.04090446978807449,
      "rewards/probe_terminal_raw/std": 0.13221491873264313,
      "rewards/rollout_reward_func/mean": -0.45661279559135437,
      "rewards/rollout_reward_func/std": 0.2438260018825531,
      "sampling/importance_sampling_ratio/max": 1.467045783996582,
      "sampling/importance_sampling_ratio/mean": 0.9993070363998413,
      "sampling/importance_sampling_ratio/min": 0.5919517874717712,
      "sampling/sampling_logp_difference/max": 0.5126774311065674,
      "sampling/sampling_logp_difference/mean": 0.021975167095661163,
      "step": 77,
      "step_time": 27.026433300999997
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.08070441009476781,
      "epoch": 0.00156,
      "grad_norm": 0.0065447925589978695,
      "kl": 0.1744868414461962,
      "learning_rate": 7.999997386674047e-06,
      "loss": 0.0,
      "step": 78,
      "step_time": 11.744910646999415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07671235466841608,
      "epoch": 0.00158,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.007739327382296324,
      "kl": 0.10829602145804529,
      "learning_rate": 7.999997260748495e-06,
      "loss": 0.0,
      "num_tokens": 4163362.0,
      "reward": 2.291594982147217,
      "reward_std": 0.39855584502220154,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9897778034210205,
      "rewards/probe_shaping_dominance/std": 0.05782533064484596,
      "rewards/probe_terminal_raw/mean": 0.009908536449074745,
      "rewards/probe_terminal_raw/std": 0.05605114996433258,
      "rewards/rollout_reward_func/mean": -0.5330914855003357,
      "rewards/rollout_reward_func/std": 0.2664976716041565,
      "sampling/importance_sampling_ratio/max": 1.3343223333358765,
      "sampling/importance_sampling_ratio/mean": 0.9947078227996826,
      "sampling/importance_sampling_ratio/min": 0.4244631230831146,
      "sampling/sampling_logp_difference/max": 0.9074487686157227,
      "sampling/sampling_logp_difference/mean": 0.022345466539263725,
      "step": 79,
      "step_time": 27.107436816999325
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.08035149308852851,
      "epoch": 0.0016,
      "grad_norm": 0.00506787933409214,
      "kl": 0.1221858259250439,
      "learning_rate": 7.999997131859992e-06,
      "loss": 0.0,
      "step": 80,
      "step_time": 12.165714977000334
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.031250000931322575,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.04375000111758709,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.1357073881663382,
      "epoch": 0.00162,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.008707523345947266,
      "kl": 0.19407588429749012,
      "learning_rate": 7.999997000008536e-06,
      "loss": 0.0,
      "num_tokens": 4264863.0,
      "reward": 2.4384140968322754,
      "reward_std": 0.4922390580177307,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.963716983795166,
      "rewards/probe_shaping_dominance/std": 0.11659354716539383,
      "rewards/probe_terminal_raw/mean": 0.03658536449074745,
      "rewards/probe_terminal_raw/std": 0.11809173226356506,
      "rewards/rollout_reward_func/mean": -0.44938817620277405,
      "rewards/rollout_reward_func/std": 0.28418225049972534,
      "sampling/importance_sampling_ratio/max": 1.7522894144058228,
      "sampling/importance_sampling_ratio/mean": 0.9879751205444336,
      "sampling/importance_sampling_ratio/min": 0.4941127300262451,
      "sampling/sampling_logp_difference/max": 0.5609221458435059,
      "sampling/sampling_logp_difference/mean": 0.03759397938847542,
      "step": 81,
      "step_time": 26.34822328099972
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.14159703021869063,
      "epoch": 0.00164,
      "grad_norm": 0.009574824012815952,
      "kl": 0.1771204932992987,
      "learning_rate": 7.999996865194129e-06,
      "loss": 0.0,
      "step": 82,
      "step_time": 11.777719495999463
    },
    {
      "clip_ratio/high_max": 0.06250000186264515,
      "clip_ratio/high_mean": 0.031250000931322575,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.04375000111758709,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.11876969272270799,
      "epoch": 0.00166,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.010034332983195782,
      "kl": 0.36267855847108876,
      "learning_rate": 7.99999672741677e-06,
      "loss": 0.0001,
      "num_tokens": 4371298.0,
      "reward": 2.316115379333496,
      "reward_std": 0.4054742753505707,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9912324547767639,
      "rewards/probe_shaping_dominance/std": 0.049596767872571945,
      "rewards/probe_terminal_raw/mean": 0.009273373521864414,
      "rewards/probe_terminal_raw/std": 0.05245811864733696,
      "rewards/rollout_reward_func/mean": -0.5093902349472046,
      "rewards/rollout_reward_func/std": 0.24608401954174042,
      "sampling/importance_sampling_ratio/max": 1.394594430923462,
      "sampling/importance_sampling_ratio/mean": 0.9233759045600891,
      "sampling/importance_sampling_ratio/min": 0.08404743671417236,
      "sampling/sampling_logp_difference/max": 2.4710586071014404,
      "sampling/sampling_logp_difference/mean": 0.07214178144931793,
      "step": 83,
      "step_time": 27.42874688900065
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.11967162042856216,
      "epoch": 0.00168,
      "grad_norm": 0.009677170775830746,
      "kl": 0.30461428755370434,
      "learning_rate": 7.999996586676458e-06,
      "loss": 0.0001,
      "step": 84,
      "step_time": 12.210796541999116
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08633493585512042,
      "epoch": 0.0017,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.009309964254498482,
      "kl": 0.34726120328798515,
      "learning_rate": 7.999996442973193e-06,
      "loss": -0.0,
      "num_tokens": 4476938.0,
      "reward": 2.3256678581237793,
      "reward_std": 0.3970645070075989,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5930821895599365,
      "rewards/rollout_reward_func/std": 0.20994225144386292,
      "sampling/importance_sampling_ratio/max": 2.7198355197906494,
      "sampling/importance_sampling_ratio/mean": 0.965837836265564,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 2.636561870574951,
      "sampling/sampling_logp_difference/mean": 0.07213791459798813,
      "step": 85,
      "step_time": 26.77135907899992
    },
    {
      "clip_ratio/high_max": 0.06250000186264515,
      "clip_ratio/high_mean": 0.031250000931322575,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0416666679084301,
      "entropy": 0.08549185702577233,
      "epoch": 0.00172,
      "grad_norm": 0.00986558198928833,
      "kl": 0.6476581503327452,
      "learning_rate": 7.99999629630698e-06,
      "loss": -0.0,
      "step": 86,
      "step_time": 11.659285754999019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08913910732371733,
      "epoch": 0.00174,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005745335482060909,
      "kl": 0.21945283197192111,
      "learning_rate": 7.999996146677813e-06,
      "loss": -0.0001,
      "num_tokens": 4579856.0,
      "reward": 2.2342212200164795,
      "reward_std": 0.5761978030204773,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.985537052154541,
      "rewards/probe_shaping_dominance/std": 0.08181492984294891,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.4981907904148102,
      "rewards/rollout_reward_func/std": 0.2684464752674103,
      "sampling/importance_sampling_ratio/max": 1.1302220821380615,
      "sampling/importance_sampling_ratio/mean": 0.9439641833305359,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.921440839767456,
      "sampling/sampling_logp_difference/mean": 0.047181740403175354,
      "step": 87,
      "step_time": 27.09005630599995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.08115326758706942,
      "epoch": 0.00176,
      "grad_norm": 0.003665071912109852,
      "kl": 0.22057799324602456,
      "learning_rate": 7.999995994085696e-06,
      "loss": -0.0001,
      "step": 88,
      "step_time": 12.136771756998769
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07007716363295913,
      "epoch": 0.00178,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.007810859940946102,
      "kl": 0.6949258089686055,
      "learning_rate": 7.999995838530628e-06,
      "loss": -0.0,
      "num_tokens": 4685612.0,
      "reward": 2.3873391151428223,
      "reward_std": 0.4150564968585968,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5001606941223145,
      "rewards/rollout_reward_func/std": 0.2632400095462799,
      "sampling/importance_sampling_ratio/max": 1.329830527305603,
      "sampling/importance_sampling_ratio/mean": 0.9396188259124756,
      "sampling/importance_sampling_ratio/min": 0.09286217391490936,
      "sampling/sampling_logp_difference/max": 2.376638174057007,
      "sampling/sampling_logp_difference/mean": 0.05502761900424957,
      "step": 89,
      "step_time": 26.554008219000025
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.07465687586227432,
      "epoch": 0.0018,
      "grad_norm": 0.009502755478024483,
      "kl": 0.22063382680062205,
      "learning_rate": 7.99999568001261e-06,
      "loss": -0.0,
      "step": 90,
      "step_time": 12.219043876999876
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.14270146866329014,
      "epoch": 0.00182,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.008744009770452976,
      "kl": 0.11013963767254609,
      "learning_rate": 7.999995518531638e-06,
      "loss": -0.0001,
      "num_tokens": 4789951.0,
      "reward": 2.567716360092163,
      "reward_std": 0.9114633798599243,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.125,
      "rewards/probe_completion_length/std": 0.9069623351097107,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9740893840789795,
      "rewards/probe_shaping_dominance/std": 0.10259100794792175,
      "rewards/probe_terminal_raw/mean": 0.02515243925154209,
      "rewards/probe_terminal_raw/std": 0.10202876478433609,
      "rewards/rollout_reward_func/mean": -0.5065252184867859,
      "rewards/rollout_reward_func/std": 0.20758704841136932,
      "sampling/importance_sampling_ratio/max": 1.6487281322479248,
      "sampling/importance_sampling_ratio/mean": 0.9680857062339783,
      "sampling/importance_sampling_ratio/min": 0.3606947958469391,
      "sampling/sampling_logp_difference/max": 0.7544957399368286,
      "sampling/sampling_logp_difference/mean": 0.04080694913864136,
      "step": 91,
      "step_time": 26.54145688799963
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.04375000111758709,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.054166668094694614,
      "entropy": 0.1536610189359635,
      "epoch": 0.00184,
      "grad_norm": 0.0049968562088906765,
      "kl": 0.21468755277851415,
      "learning_rate": 7.999995354087718e-06,
      "loss": -0.0001,
      "step": 92,
      "step_time": 12.239923568000904
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09390545927453786,
      "epoch": 0.00186,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.00847246777266264,
      "kl": 0.4723499550793804,
      "learning_rate": 7.999995186680847e-06,
      "loss": -0.0,
      "num_tokens": 4891817.0,
      "reward": 2.240363121032715,
      "reward_std": 0.4286558926105499,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9915216565132141,
      "rewards/probe_shaping_dominance/std": 0.04796085134148598,
      "rewards/probe_terminal_raw/mean": 0.008511179126799107,
      "rewards/probe_terminal_raw/std": 0.04814650118350983,
      "rewards/rollout_reward_func/mean": -0.5221695899963379,
      "rewards/rollout_reward_func/std": 0.18585550785064697,
      "sampling/importance_sampling_ratio/max": 1.2803471088409424,
      "sampling/importance_sampling_ratio/mean": 0.9798120856285095,
      "sampling/importance_sampling_ratio/min": 0.28233107924461365,
      "sampling/sampling_logp_difference/max": 1.2646756172180176,
      "sampling/sampling_logp_difference/mean": 0.03255663067102432,
      "step": 93,
      "step_time": 26.499364807999882
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.09494142327457666,
      "epoch": 0.00188,
      "grad_norm": 0.005891559179872274,
      "kl": 0.4762792717665434,
      "learning_rate": 7.999995016311026e-06,
      "loss": -0.0,
      "step": 94,
      "step_time": 11.590511038999466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.78125,
      "completions/mean_terminated_length": 2.78125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0855805806349963,
      "epoch": 0.0019,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.010784839279949665,
      "kl": 0.5285673206672072,
      "learning_rate": 7.999994842978255e-06,
      "loss": 0.0,
      "num_tokens": 4999030.0,
      "reward": 2.307888984680176,
      "reward_std": 0.558517575263977,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.78125,
      "rewards/probe_completion_length/std": 0.420013427734375,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.4233608841896057,
      "rewards/rollout_reward_func/std": 0.2430049329996109,
      "sampling/importance_sampling_ratio/max": 2.3040266036987305,
      "sampling/importance_sampling_ratio/mean": 1.0930638313293457,
      "sampling/importance_sampling_ratio/min": 0.26607653498649597,
      "sampling/sampling_logp_difference/max": 1.3239718675613403,
      "sampling/sampling_logp_difference/mean": 0.0572347566485405,
      "step": 95,
      "step_time": 27.32456371700073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.08265005028806627,
      "epoch": 0.00192,
      "grad_norm": 0.009639889933168888,
      "kl": 0.5285577713511884,
      "learning_rate": 7.999994666682534e-06,
      "loss": 0.0,
      "step": 96,
      "step_time": 12.08934896799974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.10442803846672177,
      "epoch": 0.00194,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.007832064293324947,
      "kl": 1.2743625693256035,
      "learning_rate": 7.999994487423863e-06,
      "loss": 0.0002,
      "num_tokens": 5101617.0,
      "reward": 2.3278391361236572,
      "reward_std": 0.21062178909778595,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5909109115600586,
      "rewards/rollout_reward_func/std": 0.17344380915164948,
      "sampling/importance_sampling_ratio/max": 1.2738028764724731,
      "sampling/importance_sampling_ratio/mean": 0.8911948204040527,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 2.880244493484497,
      "sampling/sampling_logp_difference/mean": 0.08490461856126785,
      "step": 97,
      "step_time": 26.761640363000424
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.09561855113133788,
      "epoch": 0.00196,
      "grad_norm": 0.0042576780542731285,
      "kl": 0.8573908178368583,
      "learning_rate": 7.999994305202242e-06,
      "loss": 0.0002,
      "step": 98,
      "step_time": 12.239888331999737
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.12256050202995539,
      "epoch": 0.00198,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.03982119634747505,
      "kl": 0.4613347239792347,
      "learning_rate": 7.999994120017672e-06,
      "loss": 0.0,
      "num_tokens": 5208185.0,
      "reward": 2.3622024059295654,
      "reward_std": 0.3201013505458832,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9496574401855469,
      "rewards/probe_shaping_dominance/std": 0.13700911402702332,
      "rewards/probe_terminal_raw/mean": 0.0570375993847847,
      "rewards/probe_terminal_raw/std": 0.15571396052837372,
      "rewards/rollout_reward_func/mean": -0.5007427334785461,
      "rewards/rollout_reward_func/std": 0.2577684223651886,
      "sampling/importance_sampling_ratio/max": 2.246042490005493,
      "sampling/importance_sampling_ratio/mean": 1.0854158401489258,
      "sampling/importance_sampling_ratio/min": 0.0747772604227066,
      "sampling/sampling_logp_difference/max": 2.5932421684265137,
      "sampling/sampling_logp_difference/mean": 0.07237481325864792,
      "step": 99,
      "step_time": 28.563245160000406
    },
    {
      "clip_ratio/high_max": 0.05000000074505806,
      "clip_ratio/high_mean": 0.02500000037252903,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.04583333432674408,
      "entropy": 0.11652607470750809,
      "epoch": 0.002,
      "grad_norm": 0.013196082785725594,
      "kl": 1.1047777848725673,
      "learning_rate": 7.999993931870152e-06,
      "loss": -0.0,
      "step": 100,
      "step_time": 11.832685018998745
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.11155627248808742,
      "epoch": 0.00202,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.011043570004403591,
      "kl": 0.8486065305769444,
      "learning_rate": 7.999993740759685e-06,
      "loss": 0.0,
      "num_tokens": 5312092.0,
      "reward": 2.469048261642456,
      "reward_std": 0.296406090259552,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 0.9914976358413696,
      "rewards/probe_shaping_dominance/std": 0.04809650778770447,
      "rewards/probe_terminal_raw/mean": 0.00889227632433176,
      "rewards/probe_terminal_raw/std": 0.05030231550335884,
      "rewards/rollout_reward_func/mean": -0.5125917196273804,
      "rewards/rollout_reward_func/std": 0.1837811917066574,
      "sampling/importance_sampling_ratio/max": 1.2519433498382568,
      "sampling/importance_sampling_ratio/mean": 0.8515626192092896,
      "sampling/importance_sampling_ratio/min": 0.08545338362455368,
      "sampling/sampling_logp_difference/max": 2.4583053588867188,
      "sampling/sampling_logp_difference/mean": 0.1055741012096405,
      "step": 101,
      "step_time": 28.246981163999408
    },
    {
      "clip_ratio/high_max": 0.0833333358168602,
      "clip_ratio/high_mean": 0.0416666679084301,
      "clip_ratio/low_mean": 0.031250000931322575,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.07291666883975267,
      "entropy": 0.10925065912306309,
      "epoch": 0.00204,
      "grad_norm": 0.008332287892699242,
      "kl": 0.7459432929754257,
      "learning_rate": 7.999993546686268e-06,
      "loss": 0.0,
      "step": 102,
      "step_time": 12.24685298599934
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09578724391758442,
      "epoch": 0.00206,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005429553799331188,
      "kl": 0.3181111275916919,
      "learning_rate": 7.999993349649902e-06,
      "loss": 0.0001,
      "num_tokens": 5417356.0,
      "reward": 2.296133279800415,
      "reward_std": 0.48034343123435974,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.983701229095459,
      "rewards/probe_shaping_dominance/std": 0.0652911514043808,
      "rewards/probe_terminal_raw/mean": 0.021214431151747704,
      "rewards/probe_terminal_raw/std": 0.08383625000715256,
      "rewards/rollout_reward_func/mean": -0.5025323629379272,
      "rewards/rollout_reward_func/std": 0.23934274911880493,
      "sampling/importance_sampling_ratio/max": 1.7521827220916748,
      "sampling/importance_sampling_ratio/mean": 1.0161978006362915,
      "sampling/importance_sampling_ratio/min": 0.559285044670105,
      "sampling/sampling_logp_difference/max": 0.5810226202011108,
      "sampling/sampling_logp_difference/mean": 0.03578226640820503,
      "step": 103,
      "step_time": 28.179791414999727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.09543535858392715,
      "epoch": 0.00208,
      "grad_norm": 0.005383977200835943,
      "kl": 0.31692405231297016,
      "learning_rate": 7.999993149650587e-06,
      "loss": 0.0,
      "step": 104,
      "step_time": 11.594287923999673
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.10437362408265471,
      "epoch": 0.0021,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.006486265454441309,
      "kl": 0.4273503478616476,
      "learning_rate": 7.999992946688324e-06,
      "loss": -0.0,
      "num_tokens": 5522766.0,
      "reward": 2.39151668548584,
      "reward_std": 0.39364051818847656,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5272334814071655,
      "rewards/rollout_reward_func/std": 0.2972264289855957,
      "sampling/importance_sampling_ratio/max": 1.9010006189346313,
      "sampling/importance_sampling_ratio/mean": 1.0246827602386475,
      "sampling/importance_sampling_ratio/min": 0.3678455054759979,
      "sampling/sampling_logp_difference/max": 1.0000989437103271,
      "sampling/sampling_logp_difference/mean": 0.03773331269621849,
      "step": 105,
      "step_time": 26.660096251999676
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.09778932714834809,
      "epoch": 0.00212,
      "grad_norm": 0.005733635742217302,
      "kl": 0.36536745447665453,
      "learning_rate": 7.999992740763114e-06,
      "loss": -0.0,
      "step": 106,
      "step_time": 12.020263065000563
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09189990477170795,
      "epoch": 0.00214,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.006950075738132,
      "kl": 0.37158518051728606,
      "learning_rate": 7.999992531874955e-06,
      "loss": 0.0,
      "num_tokens": 5624278.0,
      "reward": 2.3239517211914062,
      "reward_std": 0.4278637170791626,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9810667037963867,
      "rewards/probe_shaping_dominance/std": 0.0745052620768547,
      "rewards/probe_terminal_raw/mean": 0.021214431151747704,
      "rewards/probe_terminal_raw/std": 0.08405215293169022,
      "rewards/rollout_reward_func/mean": -0.472079336643219,
      "rewards/rollout_reward_func/std": 0.24182648956775665,
      "sampling/importance_sampling_ratio/max": 1.8587580919265747,
      "sampling/importance_sampling_ratio/mean": 0.9948133230209351,
      "sampling/importance_sampling_ratio/min": 0.488203763961792,
      "sampling/sampling_logp_difference/max": 0.6990102529525757,
      "sampling/sampling_logp_difference/mean": 0.03366800397634506,
      "step": 107,
      "step_time": 27.280253950999395
    },
    {
      "clip_ratio/high_max": 0.06666666828095913,
      "clip_ratio/high_mean": 0.033333334140479565,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.04375000111758709,
      "entropy": 0.07755104900570586,
      "epoch": 0.00216,
      "grad_norm": 0.0029529579915106297,
      "kl": 0.3871547483528275,
      "learning_rate": 7.99999232002385e-06,
      "loss": 0.0,
      "step": 108,
      "step_time": 11.582099404000473
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.04583333432674408,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08607161836698651,
      "epoch": 0.00218,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004912302363663912,
      "kl": 0.3110020191234071,
      "learning_rate": 7.999992105209796e-06,
      "loss": 0.0,
      "num_tokens": 5730240.0,
      "reward": 2.3713436126708984,
      "reward_std": 0.34508299827575684,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9868378639221191,
      "rewards/probe_shaping_dominance/std": 0.07445620000362396,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.5186194181442261,
      "rewards/rollout_reward_func/std": 0.22763106226921082,
      "sampling/importance_sampling_ratio/max": 2.4666221141815186,
      "sampling/importance_sampling_ratio/mean": 0.9437046051025391,
      "sampling/importance_sampling_ratio/min": 0.16313567757606506,
      "sampling/sampling_logp_difference/max": 1.8131763935089111,
      "sampling/sampling_logp_difference/mean": 0.07055296003818512,
      "step": 109,
      "step_time": 27.85804966900014
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "entropy": 0.08376848418265581,
      "epoch": 0.0022,
      "grad_norm": 0.021030370146036148,
      "kl": 0.3346872879192233,
      "learning_rate": 7.999991887432795e-06,
      "loss": 0.0,
      "step": 110,
      "step_time": 12.221424097000181
    },
    {
      "clip_ratio/high_max": 0.03125,
      "clip_ratio/high_mean": 0.015625,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.03645833395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09564799422514625,
      "epoch": 0.00222,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.010623163543641567,
      "kl": 1.25646445970051,
      "learning_rate": 7.999991666692848e-06,
      "loss": 0.0001,
      "num_tokens": 5834866.0,
      "reward": 2.371830463409424,
      "reward_std": 0.455732524394989,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9698338508605957,
      "rewards/probe_shaping_dominance/std": 0.118809275329113,
      "rewards/probe_terminal_raw/mean": 0.02909044735133648,
      "rewards/probe_terminal_raw/std": 0.11480555683374405,
      "rewards/rollout_reward_func/mean": -0.45209401845932007,
      "rewards/rollout_reward_func/std": 0.2390637993812561,
      "sampling/importance_sampling_ratio/max": 2.435302972793579,
      "sampling/importance_sampling_ratio/mean": 0.9616929292678833,
      "sampling/importance_sampling_ratio/min": 0.18086190521717072,
      "sampling/sampling_logp_difference/max": 1.7100262641906738,
      "sampling/sampling_logp_difference/mean": 0.06157621741294861,
      "step": 111,
      "step_time": 27.536669213000096
    },
    {
      "clip_ratio/high_max": 0.05625000037252903,
      "clip_ratio/high_mean": 0.028125000186264515,
      "clip_ratio/low_mean": 0.031250000931322575,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.05937500111758709,
      "entropy": 0.09540150425164029,
      "epoch": 0.00224,
      "grad_norm": 0.005310059990733862,
      "kl": 0.7572433853056282,
      "learning_rate": 7.999991442989953e-06,
      "loss": 0.0001,
      "step": 112,
      "step_time": 11.58020766800064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05377835238323314,
      "epoch": 0.00226,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0054777092300355434,
      "kl": 0.2139036045409739,
      "learning_rate": 7.999991216324112e-06,
      "loss": 0.0,
      "num_tokens": 5941971.0,
      "reward": 2.3715004920959473,
      "reward_std": 0.3570369482040405,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9854661822319031,
      "rewards/probe_shaping_dominance/std": 0.08221564441919327,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.5483407378196716,
      "rewards/rollout_reward_func/std": 0.21500766277313232,
      "sampling/importance_sampling_ratio/max": 1.468092441558838,
      "sampling/importance_sampling_ratio/mean": 1.0448389053344727,
      "sampling/importance_sampling_ratio/min": 0.9520513415336609,
      "sampling/sampling_logp_difference/max": 0.38396334648132324,
      "sampling/sampling_logp_difference/mean": 0.014699834398925304,
      "step": 113,
      "step_time": 26.95208743199919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.05231661406651256,
      "epoch": 0.00228,
      "grad_norm": 0.005958650726824999,
      "kl": 0.20708634098750167,
      "learning_rate": 7.999990986695325e-06,
      "loss": 0.0,
      "step": 114,
      "step_time": 12.898005667000234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.1277365549467504,
      "epoch": 0.0023,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.010303696617484093,
      "kl": 0.5558968242257833,
      "learning_rate": 7.999990754103591e-06,
      "loss": -0.0,
      "num_tokens": 6048989.0,
      "reward": 2.3545703887939453,
      "reward_std": 0.32267555594444275,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5329294204711914,
      "rewards/rollout_reward_func/std": 0.1960861086845398,
      "sampling/importance_sampling_ratio/max": 2.528221368789673,
      "sampling/importance_sampling_ratio/mean": 0.9982080459594727,
      "sampling/importance_sampling_ratio/min": 0.042695675045251846,
      "sampling/sampling_logp_difference/max": 3.153654098510742,
      "sampling/sampling_logp_difference/mean": 0.08483341336250305,
      "step": 115,
      "step_time": 28.715120017999652
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.031250000931322575,
      "clip_ratio/low_min": 0.02083333395421505,
      "clip_ratio/region_mean": 0.0416666679084301,
      "entropy": 0.1117813317105174,
      "epoch": 0.00232,
      "grad_norm": 0.006610220763832331,
      "kl": 0.6069826502352953,
      "learning_rate": 7.99999051854891e-06,
      "loss": -0.0,
      "step": 116,
      "step_time": 12.037885646000177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.035416667349636555,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.035416667349636555,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08630842622369528,
      "epoch": 0.00234,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.023052336648106575,
      "kl": 4.202049997946233,
      "learning_rate": 7.999990280031285e-06,
      "loss": -0.0,
      "num_tokens": 6156241.0,
      "reward": 2.3509585857391357,
      "reward_std": 0.3719061613082886,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5052914619445801,
      "rewards/rollout_reward_func/std": 0.26930469274520874,
      "sampling/importance_sampling_ratio/max": 1.4201393127441406,
      "sampling/importance_sampling_ratio/mean": 0.9191266298294067,
      "sampling/importance_sampling_ratio/min": 0.04002097621560097,
      "sampling/sampling_logp_difference/max": 3.218353271484375,
      "sampling/sampling_logp_difference/mean": 0.08381534367799759,
      "step": 117,
      "step_time": 27.4478307280001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.08493484603241086,
      "epoch": 0.00236,
      "grad_norm": 0.005157412961125374,
      "kl": 0.8633453572015242,
      "learning_rate": 7.999990038550715e-06,
      "loss": -0.0001,
      "step": 118,
      "step_time": 12.410220233000018
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.1009751778037753,
      "epoch": 0.00238,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.007704886142164469,
      "kl": 1.133708338191262,
      "learning_rate": 7.9999897941072e-06,
      "loss": -0.0,
      "num_tokens": 6261608.0,
      "reward": 2.272282600402832,
      "reward_std": 0.4321046769618988,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5527174472808838,
      "rewards/rollout_reward_func/std": 0.2261652648448944,
      "sampling/importance_sampling_ratio/max": 1.9247888326644897,
      "sampling/importance_sampling_ratio/mean": 0.9601424932479858,
      "sampling/importance_sampling_ratio/min": 0.10850485414266586,
      "sampling/sampling_logp_difference/max": 2.221635580062866,
      "sampling/sampling_logp_difference/mean": 0.06387770175933838,
      "step": 119,
      "step_time": 27.243004307998945
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.10485226087621413,
      "epoch": 0.0024,
      "grad_norm": 0.005486879497766495,
      "kl": 0.7662449008450487,
      "learning_rate": 7.999989546700739e-06,
      "loss": -0.0,
      "step": 120,
      "step_time": 11.642901553001138
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05734692560508847,
      "epoch": 0.00242,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0022395749110728502,
      "kl": 0.4620458657536801,
      "learning_rate": 7.999989296331334e-06,
      "loss": 0.0,
      "num_tokens": 6364884.0,
      "reward": 2.300528049468994,
      "reward_std": 0.3925109803676605,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9786701202392578,
      "rewards/probe_shaping_dominance/std": 0.08399670571088791,
      "rewards/probe_terminal_raw/mean": 0.020452234894037247,
      "rewards/probe_terminal_raw/std": 0.08055972307920456,
      "rewards/rollout_reward_func/mean": -0.5235942602157593,
      "rewards/rollout_reward_func/std": 0.19283899664878845,
      "sampling/importance_sampling_ratio/max": 1.684720754623413,
      "sampling/importance_sampling_ratio/mean": 0.9979562163352966,
      "sampling/importance_sampling_ratio/min": 0.3297406733036041,
      "sampling/sampling_logp_difference/max": 1.109449863433838,
      "sampling/sampling_logp_difference/mean": 0.03222563862800598,
      "step": 121,
      "step_time": 27.102160742999786
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.06036481284536421,
      "epoch": 0.00244,
      "grad_norm": 0.0021346518769860268,
      "kl": 0.460031573350534,
      "learning_rate": 7.999989042998983e-06,
      "loss": 0.0,
      "step": 122,
      "step_time": 12.627941945999737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.08197857672348619,
      "epoch": 0.00246,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005835927091538906,
      "kl": 0.3058228840382071,
      "learning_rate": 7.99998878670369e-06,
      "loss": -0.0,
      "num_tokens": 6470259.0,
      "reward": 2.4272561073303223,
      "reward_std": 0.2215338796377182,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.0,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9598830342292786,
      "rewards/probe_shaping_dominance/std": 0.13210204243659973,
      "rewards/probe_terminal_raw/mean": 0.04026930779218674,
      "rewards/probe_terminal_raw/std": 0.13092826306819916,
      "rewards/rollout_reward_func/mean": -0.5228960514068604,
      "rewards/rollout_reward_func/std": 0.22377446293830872,
      "sampling/importance_sampling_ratio/max": 1.2321637868881226,
      "sampling/importance_sampling_ratio/mean": 0.9182083606719971,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.2927324771881104,
      "sampling/sampling_logp_difference/mean": 0.04780565947294235,
      "step": 123,
      "step_time": 27.481588907000514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.0761918865609914,
      "epoch": 0.00248,
      "grad_norm": 0.005192534998059273,
      "kl": 0.32337066042236984,
      "learning_rate": 7.999988527445453e-06,
      "loss": -0.0,
      "step": 124,
      "step_time": 11.74153527999988
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.061301857323996956,
      "epoch": 0.0025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004524994175881147,
      "kl": 0.20127144705232547,
      "learning_rate": 7.99998826522427e-06,
      "loss": -0.0,
      "num_tokens": 6573122.0,
      "reward": 2.5412168502807617,
      "reward_std": 0.4934008717536926,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.3535533845424652,
      "rewards/probe_shaping_dominance/mean": 0.9729976058006287,
      "rewards/probe_shaping_dominance/std": 0.10630916804075241,
      "rewards/probe_terminal_raw/mean": 0.028963414952158928,
      "rewards/probe_terminal_raw/std": 0.11434794962406158,
      "rewards/rollout_reward_func/mean": -0.44199419021606445,
      "rewards/rollout_reward_func/std": 0.23288173973560333,
      "sampling/importance_sampling_ratio/max": 2.8899707794189453,
      "sampling/importance_sampling_ratio/mean": 1.0233311653137207,
      "sampling/importance_sampling_ratio/min": 0.5645219683647156,
      "sampling/sampling_logp_difference/max": 1.0612452030181885,
      "sampling/sampling_logp_difference/mean": 0.02934853918850422,
      "step": 125,
      "step_time": 26.56314809100013
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.0587867568974616,
      "epoch": 0.00252,
      "grad_norm": 0.003286329098045826,
      "kl": 0.23132333873703226,
      "learning_rate": 7.999988000040144e-06,
      "loss": -0.0,
      "step": 126,
      "step_time": 12.704706686999543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06930449209176004,
      "epoch": 0.00254,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0038534458726644516,
      "kl": 0.8923099512467161,
      "learning_rate": 7.999987731893076e-06,
      "loss": -0.0001,
      "num_tokens": 6674759.0,
      "reward": 2.476976156234741,
      "reward_std": 0.5018807053565979,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.09375,
      "rewards/probe_invalid_count/std": 0.39015090465545654,
      "rewards/probe_shaping_dominance/mean": 0.9897805452346802,
      "rewards/probe_shaping_dominance/std": 0.057810164988040924,
      "rewards/probe_terminal_raw/mean": 0.010797764174640179,
      "rewards/probe_terminal_raw/std": 0.06108137592673302,
      "rewards/rollout_reward_func/mean": -0.5048520565032959,
      "rewards/rollout_reward_func/std": 0.23183932900428772,
      "sampling/importance_sampling_ratio/max": 2.6555376052856445,
      "sampling/importance_sampling_ratio/mean": 1.037369728088379,
      "sampling/importance_sampling_ratio/min": 0.18285271525382996,
      "sampling/sampling_logp_difference/max": 1.6990761756896973,
      "sampling/sampling_logp_difference/mean": 0.04799798130989075,
      "step": 127,
      "step_time": 26.519593818999965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.07739454251714051,
      "epoch": 0.00256,
      "grad_norm": 0.0046963742934167385,
      "kl": 0.8950551702291705,
      "learning_rate": 7.999987460783066e-06,
      "loss": -0.0001,
      "step": 128,
      "step_time": 11.701040565999392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04215445008594543,
      "epoch": 0.00258,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004038817714899778,
      "kl": 0.483372636698145,
      "learning_rate": 7.999987186710111e-06,
      "loss": -0.0001,
      "num_tokens": 6778164.0,
      "reward": 2.3669238090515137,
      "reward_std": 0.33272045850753784,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9665718078613281,
      "rewards/probe_shaping_dominance/std": 0.11594124883413315,
      "rewards/probe_terminal_raw/mean": 0.033663615584373474,
      "rewards/probe_terminal_raw/std": 0.11093832552433014,
      "rewards/rollout_reward_func/mean": -0.5208115577697754,
      "rewards/rollout_reward_func/std": 0.22583386301994324,
      "sampling/importance_sampling_ratio/max": 1.324372410774231,
      "sampling/importance_sampling_ratio/mean": 0.9827702045440674,
      "sampling/importance_sampling_ratio/min": 0.15934889018535614,
      "sampling/sampling_logp_difference/max": 1.8366597890853882,
      "sampling/sampling_logp_difference/mean": 0.03050372563302517,
      "step": 129,
      "step_time": 29.272002608000093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.040716532384976745,
      "epoch": 0.0026,
      "grad_norm": 0.004598686005920172,
      "kl": 0.48791675676284285,
      "learning_rate": 7.999986909674215e-06,
      "loss": -0.0001,
      "step": 130,
      "step_time": 11.615075072000309
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07680852155863249,
      "epoch": 0.00262,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004599301610141993,
      "kl": 0.5561261102557182,
      "learning_rate": 7.999986629675377e-06,
      "loss": 0.0001,
      "num_tokens": 6881343.0,
      "reward": 2.428385019302368,
      "reward_std": 0.35835328698158264,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.988267183303833,
      "rewards/probe_shaping_dominance/std": 0.06637061387300491,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.4630073308944702,
      "rewards/rollout_reward_func/std": 0.23799148201942444,
      "sampling/importance_sampling_ratio/max": 2.105088472366333,
      "sampling/importance_sampling_ratio/mean": 1.0250680446624756,
      "sampling/importance_sampling_ratio/min": 0.24339471757411957,
      "sampling/sampling_logp_difference/max": 1.413072109222412,
      "sampling/sampling_logp_difference/mean": 0.05859563127160072,
      "step": 131,
      "step_time": 27.499229768000532
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.07459181371518753,
      "epoch": 0.00264,
      "grad_norm": 0.0046109952963888645,
      "kl": 0.4819548297673464,
      "learning_rate": 7.999986346713597e-06,
      "loss": 0.0001,
      "step": 132,
      "step_time": 11.681140706999486
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06963892979547381,
      "epoch": 0.00266,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004053663462400436,
      "kl": 0.29985905811190605,
      "learning_rate": 7.999986060788874e-06,
      "loss": -0.0001,
      "num_tokens": 6984936.0,
      "reward": 2.398922920227051,
      "reward_std": 0.3926793932914734,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9907037019729614,
      "rewards/probe_shaping_dominance/std": 0.052587706595659256,
      "rewards/probe_terminal_raw/mean": 0.007876016199588776,
      "rewards/probe_terminal_raw/std": 0.0445534773170948,
      "rewards/rollout_reward_func/mean": -0.45590683817863464,
      "rewards/rollout_reward_func/std": 0.20304201543331146,
      "sampling/importance_sampling_ratio/max": 1.1057724952697754,
      "sampling/importance_sampling_ratio/mean": 0.917495846748352,
      "sampling/importance_sampling_ratio/min": 0.2753896415233612,
      "sampling/sampling_logp_difference/max": 1.2891517877578735,
      "sampling/sampling_logp_difference/mean": 0.049349602311849594,
      "step": 133,
      "step_time": 28.668226430000686
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.07001902349293232,
      "epoch": 0.00268,
      "grad_norm": 0.0046079279854893684,
      "kl": 0.30660303554032,
      "learning_rate": 7.999985771901212e-06,
      "loss": -0.0001,
      "step": 134,
      "step_time": 11.78814972499913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0837576383491978,
      "epoch": 0.0027,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004887988790869713,
      "kl": 0.48908784112427384,
      "learning_rate": 7.999985480050609e-06,
      "loss": 0.0,
      "num_tokens": 7089375.0,
      "reward": 2.383143901824951,
      "reward_std": 0.2860008180141449,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9715699553489685,
      "rewards/probe_shaping_dominance/std": 0.11188202351331711,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.5384261608123779,
      "rewards/rollout_reward_func/std": 0.24632836878299713,
      "sampling/importance_sampling_ratio/max": 2.175699234008789,
      "sampling/importance_sampling_ratio/mean": 0.9764343500137329,
      "sampling/importance_sampling_ratio/min": 0.37150871753692627,
      "sampling/sampling_logp_difference/max": 1.0082650184631348,
      "sampling/sampling_logp_difference/mean": 0.04385855793952942,
      "step": 135,
      "step_time": 27.26713926100001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.07945482805371284,
      "epoch": 0.00272,
      "grad_norm": 0.005393319763243198,
      "kl": 0.4894396271556616,
      "learning_rate": 7.999985185237063e-06,
      "loss": 0.0,
      "step": 136,
      "step_time": 11.740167015000225
    },
    {
      "clip_ratio/high_max": 0.012500000186264515,
      "clip_ratio/high_mean": 0.0062500000931322575,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.016666667070239782,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07358018541708589,
      "epoch": 0.00274,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04956609383225441,
      "kl": 7.594387605204247,
      "learning_rate": 7.999984887460579e-06,
      "loss": 0.0,
      "num_tokens": 7195651.0,
      "reward": 2.523413896560669,
      "reward_std": 1.283755898475647,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 1.1639753580093384,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9845632314682007,
      "rewards/probe_shaping_dominance/std": 0.08732341974973679,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.4580242931842804,
      "rewards/rollout_reward_func/std": 0.29842740297317505,
      "sampling/importance_sampling_ratio/max": 1.5995361804962158,
      "sampling/importance_sampling_ratio/mean": 0.9101204872131348,
      "sampling/importance_sampling_ratio/min": 0.2878796458244324,
      "sampling/sampling_logp_difference/max": 1.2452144622802734,
      "sampling/sampling_logp_difference/mean": 0.08170486986637115,
      "step": 137,
      "step_time": 35.5617492829997
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.04583333432674408,
      "entropy": 0.0833338184747845,
      "epoch": 0.00276,
      "grad_norm": 0.004238603170961142,
      "kl": 0.8713670628203545,
      "learning_rate": 7.999984586721153e-06,
      "loss": -0.0001,
      "step": 138,
      "step_time": 13.092057540999122
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.077066877449397,
      "epoch": 0.00278,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.006206016521900892,
      "kl": 0.2502201258515315,
      "learning_rate": 7.999984283018788e-06,
      "loss": -0.0001,
      "num_tokens": 7298420.0,
      "reward": 2.434345006942749,
      "reward_std": 0.33564823865890503,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.4531550407409668,
      "rewards/rollout_reward_func/std": 0.23981256783008575,
      "sampling/importance_sampling_ratio/max": 1.5575754642486572,
      "sampling/importance_sampling_ratio/mean": 0.9945090413093567,
      "sampling/importance_sampling_ratio/min": 0.39499369263648987,
      "sampling/sampling_logp_difference/max": 0.9288842678070068,
      "sampling/sampling_logp_difference/mean": 0.0369817316532135,
      "step": 139,
      "step_time": 26.636275078999915
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.08048825367586687,
      "epoch": 0.0028,
      "grad_norm": 0.004995269235223532,
      "kl": 0.1949386877240613,
      "learning_rate": 7.999983976353484e-06,
      "loss": -0.0001,
      "step": 140,
      "step_time": 11.886442712999724
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09608687367290258,
      "epoch": 0.00282,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.010193965397775173,
      "kl": 1.043814627239044,
      "learning_rate": 7.99998366672524e-06,
      "loss": 0.0001,
      "num_tokens": 7400213.0,
      "reward": 2.357463836669922,
      "reward_std": 0.45996955037117004,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9802613258361816,
      "rewards/probe_shaping_dominance/std": 0.0787685438990593,
      "rewards/probe_terminal_raw/mean": 0.017403453588485718,
      "rewards/probe_terminal_raw/std": 0.06857709586620331,
      "rewards/rollout_reward_func/mean": -0.46520087122917175,
      "rewards/rollout_reward_func/std": 0.23765753209590912,
      "sampling/importance_sampling_ratio/max": 2.0903208255767822,
      "sampling/importance_sampling_ratio/mean": 1.064300775527954,
      "sampling/importance_sampling_ratio/min": 0.2817336320877075,
      "sampling/sampling_logp_difference/max": 1.266794204711914,
      "sampling/sampling_logp_difference/mean": 0.04518420994281769,
      "step": 141,
      "step_time": 27.64493636099951
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "entropy": 0.10309219686314464,
      "epoch": 0.00284,
      "grad_norm": 0.01219659112393856,
      "kl": 0.6812123054987751,
      "learning_rate": 7.999983354134058e-06,
      "loss": 0.0,
      "step": 142,
      "step_time": 11.569478897000408
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07752494711894542,
      "epoch": 0.00286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004447088576853275,
      "kl": 0.28799188635699124,
      "learning_rate": 7.999983038579937e-06,
      "loss": -0.0002,
      "num_tokens": 7502202.0,
      "reward": 2.4029557704925537,
      "reward_std": 0.41433292627334595,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.4220443069934845,
      "rewards/rollout_reward_func/std": 0.2347659468650818,
      "sampling/importance_sampling_ratio/max": 2.925204277038574,
      "sampling/importance_sampling_ratio/mean": 1.0200954675674438,
      "sampling/importance_sampling_ratio/min": 0.2386324405670166,
      "sampling/sampling_logp_difference/max": 1.4322543144226074,
      "sampling/sampling_logp_difference/mean": 0.04332014173269272,
      "step": 143,
      "step_time": 27.17438340000035
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.07426338468212634,
      "epoch": 0.00288,
      "grad_norm": 0.004469662439078093,
      "kl": 0.2410876297701634,
      "learning_rate": 7.999982720062878e-06,
      "loss": -0.0002,
      "step": 144,
      "step_time": 12.213636597999539
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08634203940164298,
      "epoch": 0.0029,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.002921090926975012,
      "kl": 0.230285348889538,
      "learning_rate": 7.99998239858288e-06,
      "loss": 0.0,
      "num_tokens": 7607649.0,
      "reward": 2.3042469024658203,
      "reward_std": 0.4113651216030121,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5207530856132507,
      "rewards/rollout_reward_func/std": 0.2033592164516449,
      "sampling/importance_sampling_ratio/max": 1.081487774848938,
      "sampling/importance_sampling_ratio/mean": 0.961658239364624,
      "sampling/importance_sampling_ratio/min": 0.3403857946395874,
      "sampling/sampling_logp_difference/max": 0.7405810356140137,
      "sampling/sampling_logp_difference/mean": 0.02413717657327652,
      "step": 145,
      "step_time": 28.17627675400081
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.08849173790076748,
      "epoch": 0.00292,
      "grad_norm": 0.0025327985640615225,
      "kl": 0.24220079024462393,
      "learning_rate": 7.999982074139944e-06,
      "loss": 0.0,
      "step": 146,
      "step_time": 11.552079900000535
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.11541430978104472,
      "epoch": 0.00294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0037195596378296614,
      "kl": 0.24169684358639643,
      "learning_rate": 7.999981746734073e-06,
      "loss": -0.0001,
      "num_tokens": 7714926.0,
      "reward": 2.362529754638672,
      "reward_std": 0.3588845729827881,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9859415292739868,
      "rewards/probe_shaping_dominance/std": 0.07952678948640823,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.5265366435050964,
      "rewards/rollout_reward_func/std": 0.2366112768650055,
      "sampling/importance_sampling_ratio/max": 1.8165228366851807,
      "sampling/importance_sampling_ratio/mean": 1.0579065084457397,
      "sampling/importance_sampling_ratio/min": 0.4353120028972626,
      "sampling/sampling_logp_difference/max": 0.826627790927887,
      "sampling/sampling_logp_difference/mean": 0.04029189795255661,
      "step": 147,
      "step_time": 27.175546237000162
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.1161738510709256,
      "epoch": 0.00296,
      "grad_norm": 0.0037887210492044687,
      "kl": 0.23712664423510432,
      "learning_rate": 7.999981416365263e-06,
      "loss": -0.0,
      "step": 148,
      "step_time": 12.20823843899916
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.105490946007194,
      "epoch": 0.00298,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005545547232031822,
      "kl": 0.10429394743793807,
      "learning_rate": 7.999981083033518e-06,
      "loss": -0.0,
      "num_tokens": 7820271.0,
      "reward": 2.2831099033355713,
      "reward_std": 0.39255067706108093,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5418901443481445,
      "rewards/rollout_reward_func/std": 0.2250201553106308,
      "sampling/importance_sampling_ratio/max": 1.449048399925232,
      "sampling/importance_sampling_ratio/mean": 0.9792050719261169,
      "sampling/importance_sampling_ratio/min": 0.2817993760108948,
      "sampling/sampling_logp_difference/max": 1.2665607929229736,
      "sampling/sampling_logp_difference/mean": 0.03002801164984703,
      "step": 149,
      "step_time": 27.53580150099924
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.10439014286384918,
      "epoch": 0.003,
      "grad_norm": 0.00822756253182888,
      "kl": 0.11194274778247859,
      "learning_rate": 7.999980746738835e-06,
      "loss": -0.0,
      "step": 150,
      "step_time": 11.669001740000112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.1254521356895566,
      "epoch": 0.00302,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.008205846883356571,
      "kl": 0.2568075335584581,
      "learning_rate": 7.999980407481217e-06,
      "loss": -0.0,
      "num_tokens": 7922328.0,
      "reward": 2.4083704948425293,
      "reward_std": 0.3905543088912964,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9785879850387573,
      "rewards/probe_shaping_dominance/std": 0.08963118493556976,
      "rewards/probe_terminal_raw/mean": 0.0260416679084301,
      "rewards/probe_terminal_raw/std": 0.1046360433101654,
      "rewards/rollout_reward_func/mean": -0.45250916481018066,
      "rewards/rollout_reward_func/std": 0.25463223457336426,
      "sampling/importance_sampling_ratio/max": 1.165947437286377,
      "sampling/importance_sampling_ratio/mean": 0.9090801477432251,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.9794785976409912,
      "sampling/sampling_logp_difference/mean": 0.06048261374235153,
      "step": 151,
      "step_time": 25.965173581000272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.11868520639836788,
      "epoch": 0.00304,
      "grad_norm": 0.008953132666647434,
      "kl": 0.6233456870540977,
      "learning_rate": 7.999980065260663e-06,
      "loss": -0.0001,
      "step": 152,
      "step_time": 12.843935258000784
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.11048904561903328,
      "epoch": 0.00306,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.00968129187822342,
      "kl": 0.14061896470107627,
      "learning_rate": 7.999979720077173e-06,
      "loss": -0.0,
      "num_tokens": 8026423.0,
      "reward": 2.419642925262451,
      "reward_std": 0.30986252427101135,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9758727550506592,
      "rewards/probe_shaping_dominance/std": 0.10938115417957306,
      "rewards/probe_terminal_raw/mean": 0.0209603663533926,
      "rewards/probe_terminal_raw/std": 0.09247327595949173,
      "rewards/rollout_reward_func/mean": -0.49594029784202576,
      "rewards/rollout_reward_func/std": 0.2378591150045395,
      "sampling/importance_sampling_ratio/max": 1.1600902080535889,
      "sampling/importance_sampling_ratio/mean": 0.9520583152770996,
      "sampling/importance_sampling_ratio/min": 0.5003088712692261,
      "sampling/sampling_logp_difference/max": 0.6657150983810425,
      "sampling/sampling_logp_difference/mean": 0.025925474241375923,
      "step": 153,
      "step_time": 26.941947170000276
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.1122089575510472,
      "epoch": 0.00308,
      "grad_norm": 0.00867843721061945,
      "kl": 0.15484224071647645,
      "learning_rate": 7.99997937193075e-06,
      "loss": -0.0,
      "step": 154,
      "step_time": 11.658896313999776
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0629729179199785,
      "epoch": 0.0031,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.003953923936933279,
      "kl": 0.03362982640601331,
      "learning_rate": 7.99997902082139e-06,
      "loss": 0.0,
      "num_tokens": 8134364.0,
      "reward": 2.304103374481201,
      "reward_std": 0.3902580142021179,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9852296113967896,
      "rewards/probe_shaping_dominance/std": 0.08355414122343063,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.5217512845993042,
      "rewards/rollout_reward_func/std": 0.20511233806610107,
      "sampling/importance_sampling_ratio/max": 1.2205973863601685,
      "sampling/importance_sampling_ratio/mean": 0.9658781290054321,
      "sampling/importance_sampling_ratio/min": 0.46778079867362976,
      "sampling/sampling_logp_difference/max": 0.7597565650939941,
      "sampling/sampling_logp_difference/mean": 0.021998237818479538,
      "step": 155,
      "step_time": 27.223922481999125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.06562891032081097,
      "epoch": 0.00312,
      "grad_norm": 0.004405137151479721,
      "kl": 0.038039611198541934,
      "learning_rate": 7.999978666749097e-06,
      "loss": 0.0,
      "step": 156,
      "step_time": 12.512135376999595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05827112344559282,
      "epoch": 0.00314,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004053921438753605,
      "kl": 0.22048271807530284,
      "learning_rate": 7.99997830971387e-06,
      "loss": -0.0,
      "num_tokens": 8238748.0,
      "reward": 2.4397072792053223,
      "reward_std": 0.3176124691963196,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9892078638076782,
      "rewards/probe_shaping_dominance/std": 0.061049580574035645,
      "rewards/probe_terminal_raw/mean": 0.010670731775462627,
      "rewards/probe_terminal_raw/std": 0.06036277487874031,
      "rewards/rollout_reward_func/mean": -0.5101712346076965,
      "rewards/rollout_reward_func/std": 0.20784814655780792,
      "sampling/importance_sampling_ratio/max": 1.6952624320983887,
      "sampling/importance_sampling_ratio/mean": 0.9711546301841736,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.1546943187713623,
      "sampling/sampling_logp_difference/mean": 0.03182876855134964,
      "step": 157,
      "step_time": 27.540745071999936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.031250000931322575,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "entropy": 0.058620097348466516,
      "epoch": 0.00316,
      "grad_norm": 0.0032319524325430393,
      "kl": 0.2064171105599364,
      "learning_rate": 7.999977949715709e-06,
      "loss": -0.0,
      "step": 158,
      "step_time": 11.632630814000095
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08823958231369033,
      "epoch": 0.00318,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005462405737489462,
      "kl": 0.09290702206544665,
      "learning_rate": 7.999977586754615e-06,
      "loss": 0.0001,
      "num_tokens": 8341164.0,
      "reward": 2.443883180618286,
      "reward_std": 0.2663474678993225,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9600480794906616,
      "rewards/probe_shaping_dominance/std": 0.12650074064731598,
      "rewards/probe_terminal_raw/mean": 0.046875,
      "rewards/probe_terminal_raw/std": 0.1480722874403,
      "rewards/rollout_reward_func/mean": -0.48178985714912415,
      "rewards/rollout_reward_func/std": 0.22425328195095062,
      "sampling/importance_sampling_ratio/max": 1.382658839225769,
      "sampling/importance_sampling_ratio/mean": 1.018369197845459,
      "sampling/importance_sampling_ratio/min": 0.8050516247749329,
      "sampling/sampling_logp_difference/max": 0.3240091800689697,
      "sampling/sampling_logp_difference/mean": 0.023685907945036888,
      "step": 159,
      "step_time": 27.411928095999883
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.08342378272209316,
      "epoch": 0.0032,
      "grad_norm": 0.0198823194950819,
      "kl": 0.08883899757620384,
      "learning_rate": 7.999977220830588e-06,
      "loss": 0.0001,
      "step": 160,
      "step_time": 12.353684361999967
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06228045103489421,
      "epoch": 0.00322,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.002511651022359729,
      "kl": 0.1462944263475947,
      "learning_rate": 7.999976851943628e-06,
      "loss": -0.0,
      "num_tokens": 8445224.0,
      "reward": 2.391735076904297,
      "reward_std": 0.3887004256248474,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.4645148813724518,
      "rewards/rollout_reward_func/std": 0.24512337148189545,
      "sampling/importance_sampling_ratio/max": 1.2499885559082031,
      "sampling/importance_sampling_ratio/mean": 0.964512288570404,
      "sampling/importance_sampling_ratio/min": 0.2849932909011841,
      "sampling/sampling_logp_difference/max": 1.2552961111068726,
      "sampling/sampling_logp_difference/mean": 0.02673853561282158,
      "step": 161,
      "step_time": 26.90330324300021
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.07126606599922525,
      "epoch": 0.00324,
      "grad_norm": 0.00517527898773551,
      "kl": 0.13863739833080524,
      "learning_rate": 7.999976480093737e-06,
      "loss": -0.0,
      "step": 162,
      "step_time": 11.688447676000578
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07072257142863236,
      "epoch": 0.00326,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004865987226366997,
      "kl": 0.1391429503753443,
      "learning_rate": 7.999976105280914e-06,
      "loss": -0.0,
      "num_tokens": 8551746.0,
      "reward": 2.3334262371063232,
      "reward_std": 0.42871803045272827,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9307848215103149,
      "rewards/probe_shaping_dominance/std": 0.1679239422082901,
      "rewards/probe_terminal_raw/mean": 0.07113821059465408,
      "rewards/probe_terminal_raw/std": 0.1717527210712433,
      "rewards/rollout_reward_func/mean": -0.5247467756271362,
      "rewards/rollout_reward_func/std": 0.24572212994098663,
      "sampling/importance_sampling_ratio/max": 1.3134804964065552,
      "sampling/importance_sampling_ratio/mean": 1.0010151863098145,
      "sampling/importance_sampling_ratio/min": 0.42815467715263367,
      "sampling/sampling_logp_difference/max": 0.8482714891433716,
      "sampling/sampling_logp_difference/mean": 0.01988227292895317,
      "step": 163,
      "step_time": 28.07267034399956
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.07096506169182248,
      "epoch": 0.00328,
      "grad_norm": 0.004104274325072765,
      "kl": 0.13155441358685493,
      "learning_rate": 7.99997572750516e-06,
      "loss": -0.0,
      "step": 164,
      "step_time": 11.647160391999023
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.078909770467817,
      "epoch": 0.0033,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004251962527632713,
      "kl": 0.09027766038946083,
      "learning_rate": 7.999975346766472e-06,
      "loss": -0.0,
      "num_tokens": 8658732.0,
      "reward": 2.414771795272827,
      "reward_std": 0.3757838010787964,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9784373044967651,
      "rewards/probe_shaping_dominance/std": 0.08622659742832184,
      "rewards/probe_terminal_raw/mean": 0.024517275393009186,
      "rewards/probe_terminal_raw/std": 0.10027948766946793,
      "rewards/rollout_reward_func/mean": -0.47568273544311523,
      "rewards/rollout_reward_func/std": 0.19167323410511017,
      "sampling/importance_sampling_ratio/max": 1.1542701721191406,
      "sampling/importance_sampling_ratio/mean": 0.9669894576072693,
      "sampling/importance_sampling_ratio/min": 0.6857547163963318,
      "sampling/sampling_logp_difference/max": 0.37537309527397156,
      "sampling/sampling_logp_difference/mean": 0.017938656732439995,
      "step": 165,
      "step_time": 27.2606650000007
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "entropy": 0.07565992117088172,
      "epoch": 0.00332,
      "grad_norm": 0.006961170118302107,
      "kl": 0.08890455095081506,
      "learning_rate": 7.999974963064855e-06,
      "loss": -0.0,
      "step": 166,
      "step_time": 11.698157390000233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07434030482545495,
      "epoch": 0.00334,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004832255654036999,
      "kl": 0.15626501338783783,
      "learning_rate": 7.999974576400308e-06,
      "loss": -0.0,
      "num_tokens": 8765380.0,
      "reward": 2.2938361167907715,
      "reward_std": 0.4383181631565094,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.960378885269165,
      "rewards/probe_shaping_dominance/std": 0.12596461176872253,
      "rewards/probe_terminal_raw/mean": 0.046875,
      "rewards/probe_terminal_raw/std": 0.1480722874403,
      "rewards/rollout_reward_func/mean": -0.5071678757667542,
      "rewards/rollout_reward_func/std": 0.2304636836051941,
      "sampling/importance_sampling_ratio/max": 1.6727243661880493,
      "sampling/importance_sampling_ratio/mean": 1.0108327865600586,
      "sampling/importance_sampling_ratio/min": 0.4802703857421875,
      "sampling/sampling_logp_difference/max": 0.737343966960907,
      "sampling/sampling_logp_difference/mean": 0.023180868476629257,
      "step": 167,
      "step_time": 28.38192438599981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.0768249062821269,
      "epoch": 0.00336,
      "grad_norm": 0.0052077267318964005,
      "kl": 0.15163502033101395,
      "learning_rate": 7.999974186772832e-06,
      "loss": -0.0,
      "step": 168,
      "step_time": 11.745391591000953
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.11408041534014046,
      "epoch": 0.00338,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005346886347979307,
      "kl": 0.05663721589365878,
      "learning_rate": 7.999973794182426e-06,
      "loss": 0.0,
      "num_tokens": 8871458.0,
      "reward": 2.347496271133423,
      "reward_std": 0.37117481231689453,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.984038233757019,
      "rewards/probe_shaping_dominance/std": 0.09029316157102585,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.477167010307312,
      "rewards/rollout_reward_func/std": 0.2263534814119339,
      "sampling/importance_sampling_ratio/max": 1.2048081159591675,
      "sampling/importance_sampling_ratio/mean": 0.967424750328064,
      "sampling/importance_sampling_ratio/min": 0.7366955280303955,
      "sampling/sampling_logp_difference/max": 0.3062773644924164,
      "sampling/sampling_logp_difference/mean": 0.022138062864542007,
      "step": 169,
      "step_time": 26.940671711000505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.11010659678140655,
      "epoch": 0.0034,
      "grad_norm": 0.006358719430863857,
      "kl": 0.05905036644250572,
      "learning_rate": 7.99997339862909e-06,
      "loss": 0.0,
      "step": 170,
      "step_time": 12.187279679998937
    },
    {
      "clip_ratio/high_max": 0.06666666828095913,
      "clip_ratio/high_mean": 0.033333334140479565,
      "clip_ratio/low_mean": 0.035416667349636555,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.06875000149011612,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.10378921253141016,
      "epoch": 0.00342,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004866220988333225,
      "kl": 0.3513250324758701,
      "learning_rate": 7.999973000112826e-06,
      "loss": -0.0,
      "num_tokens": 8977121.0,
      "reward": 2.3662233352661133,
      "reward_std": 0.36591798067092896,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.490026593208313,
      "rewards/rollout_reward_func/std": 0.1713269054889679,
      "sampling/importance_sampling_ratio/max": 2.4813146591186523,
      "sampling/importance_sampling_ratio/mean": 1.0544798374176025,
      "sampling/importance_sampling_ratio/min": 0.5539883375167847,
      "sampling/sampling_logp_difference/max": 0.9087880849838257,
      "sampling/sampling_logp_difference/mean": 0.04017889127135277,
      "step": 171,
      "step_time": 27.655318435999106
    },
    {
      "clip_ratio/high_max": 0.06666666828095913,
      "clip_ratio/high_mean": 0.033333334140479565,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.04375000111758709,
      "entropy": 0.10798206774052233,
      "epoch": 0.00344,
      "grad_norm": 0.012118767946958542,
      "kl": 0.39312139721005224,
      "learning_rate": 7.999972598633632e-06,
      "loss": -0.0,
      "step": 172,
      "step_time": 11.631308623997938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06574001582339406,
      "epoch": 0.00346,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004147569183260202,
      "kl": 0.01835462471728988,
      "learning_rate": 7.999972194191514e-06,
      "loss": 0.0001,
      "num_tokens": 9080753.0,
      "reward": 2.3741204738616943,
      "reward_std": 0.33386632800102234,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9828835725784302,
      "rewards/probe_shaping_dominance/std": 0.06857176870107651,
      "rewards/probe_terminal_raw/mean": 0.016895325854420662,
      "rewards/probe_terminal_raw/std": 0.067360520362854,
      "rewards/rollout_reward_func/mean": -0.48190829157829285,
      "rewards/rollout_reward_func/std": 0.23477764427661896,
      "sampling/importance_sampling_ratio/max": 2.0903360843658447,
      "sampling/importance_sampling_ratio/mean": 1.0450650453567505,
      "sampling/importance_sampling_ratio/min": 0.8843300342559814,
      "sampling/sampling_logp_difference/max": 0.7373225688934326,
      "sampling/sampling_logp_difference/mean": 0.01723039150238037,
      "step": 173,
      "step_time": 26.502221221999207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.06844324560370296,
      "epoch": 0.00348,
      "grad_norm": 0.0040916260331869125,
      "kl": 0.022212313354311064,
      "learning_rate": 7.999971786786465e-06,
      "loss": 0.0001,
      "step": 174,
      "step_time": 11.897189610000169
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07524242554791272,
      "epoch": 0.0035,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005887735169380903,
      "kl": 0.22349138231948018,
      "learning_rate": 7.99997137641849e-06,
      "loss": -0.0,
      "num_tokens": 9185715.0,
      "reward": 2.4274468421936035,
      "reward_std": 0.30020296573638916,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9728903770446777,
      "rewards/probe_shaping_dominance/std": 0.10675826668739319,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.46419334411621094,
      "rewards/rollout_reward_func/std": 0.211602121591568,
      "sampling/importance_sampling_ratio/max": 1.1768231391906738,
      "sampling/importance_sampling_ratio/mean": 0.9632406830787659,
      "sampling/importance_sampling_ratio/min": 0.32605040073394775,
      "sampling/sampling_logp_difference/max": 1.1148320436477661,
      "sampling/sampling_logp_difference/mean": 0.02662883885204792,
      "step": 175,
      "step_time": 27.599337874999037
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.0704325451515615,
      "epoch": 0.00352,
      "grad_norm": 0.004202236421406269,
      "kl": 0.2313449110952206,
      "learning_rate": 7.999970963087587e-06,
      "loss": -0.0,
      "step": 176,
      "step_time": 11.622392715999013
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.13769991835579276,
      "epoch": 0.00354,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.008693602867424488,
      "kl": 0.1878440118744038,
      "learning_rate": 7.99997054679376e-06,
      "loss": -0.0001,
      "num_tokens": 9289277.0,
      "reward": 2.358966588973999,
      "reward_std": 0.3925982713699341,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9915453791618347,
      "rewards/probe_shaping_dominance/std": 0.04782645031809807,
      "rewards/probe_terminal_raw/mean": 0.006986788474023342,
      "rewards/probe_terminal_raw/std": 0.03952324390411377,
      "rewards/rollout_reward_func/mean": -0.4958154261112213,
      "rewards/rollout_reward_func/std": 0.18107342720031738,
      "sampling/importance_sampling_ratio/max": 1.5426419973373413,
      "sampling/importance_sampling_ratio/mean": 0.9988285303115845,
      "sampling/importance_sampling_ratio/min": 0.43416687846183777,
      "sampling/sampling_logp_difference/max": 0.5040676593780518,
      "sampling/sampling_logp_difference/mean": 0.04200742021203041,
      "step": 177,
      "step_time": 27.01749301200016
    },
    {
      "clip_ratio/high_max": 0.06666666828095913,
      "clip_ratio/high_mean": 0.033333334140479565,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.04375000111758709,
      "entropy": 0.13739392068237066,
      "epoch": 0.00356,
      "grad_norm": 0.00472621712833643,
      "kl": 0.1952200917294249,
      "learning_rate": 7.999970127537005e-06,
      "loss": -0.0001,
      "step": 178,
      "step_time": 12.335309556999164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07692393008619547,
      "epoch": 0.00358,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.028508227318525314,
      "kl": 5.385302404543381,
      "learning_rate": 7.999969705317325e-06,
      "loss": 0.0001,
      "num_tokens": 9389166.0,
      "reward": 2.4562783241271973,
      "reward_std": 0.2598528265953064,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.4624716639518738,
      "rewards/rollout_reward_func/std": 0.18222831189632416,
      "sampling/importance_sampling_ratio/max": 1.1737666130065918,
      "sampling/importance_sampling_ratio/mean": 0.9517749547958374,
      "sampling/importance_sampling_ratio/min": 0.2871549129486084,
      "sampling/sampling_logp_difference/max": 1.2477340698242188,
      "sampling/sampling_logp_difference/mean": 0.038661930710077286,
      "step": 179,
      "step_time": 26.827888970999993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.08128447085618973,
      "epoch": 0.0036,
      "grad_norm": 0.00963876023888588,
      "kl": 2.0179060684172327,
      "learning_rate": 7.99996928013472e-06,
      "loss": 0.0001,
      "step": 180,
      "step_time": 11.37347329900058
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.10767973656766117,
      "epoch": 0.00362,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.010538225993514061,
      "kl": 1.1300342498579994,
      "learning_rate": 7.999968851989192e-06,
      "loss": 0.0,
      "num_tokens": 9494689.0,
      "reward": 2.297545909881592,
      "reward_std": 0.3879827558994293,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.98197340965271,
      "rewards/probe_shaping_dominance/std": 0.10197389870882034,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.5250524282455444,
      "rewards/rollout_reward_func/std": 0.19950900971889496,
      "sampling/importance_sampling_ratio/max": 1.4360560178756714,
      "sampling/importance_sampling_ratio/mean": 0.9875404834747314,
      "sampling/importance_sampling_ratio/min": 0.18539370596408844,
      "sampling/sampling_logp_difference/max": 1.6852741241455078,
      "sampling/sampling_logp_difference/mean": 0.049665287137031555,
      "step": 181,
      "step_time": 26.69406858900038
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.11389242531731725,
      "epoch": 0.00364,
      "grad_norm": 0.003970544785261154,
      "kl": 0.6293696188367903,
      "learning_rate": 7.999968420880736e-06,
      "loss": 0.0,
      "step": 182,
      "step_time": 12.197639549000996
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09248453052714467,
      "epoch": 0.00366,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0045459093526005745,
      "kl": 0.13789485239249188,
      "learning_rate": 7.99996798680936e-06,
      "loss": -0.0001,
      "num_tokens": 9599380.0,
      "reward": 2.4226768016815186,
      "reward_std": 0.3249405324459076,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9769073724746704,
      "rewards/probe_shaping_dominance/std": 0.09389247745275497,
      "rewards/probe_terminal_raw/mean": 0.024263210594654083,
      "rewards/probe_terminal_raw/std": 0.09960746020078659,
      "rewards/rollout_reward_func/mean": -0.4659937620162964,
      "rewards/rollout_reward_func/std": 0.19758032262325287,
      "sampling/importance_sampling_ratio/max": 1.1653671264648438,
      "sampling/importance_sampling_ratio/mean": 0.9370558261871338,
      "sampling/importance_sampling_ratio/min": 0.46233388781547546,
      "sampling/sampling_logp_difference/max": 0.7714686393737793,
      "sampling/sampling_logp_difference/mean": 0.038370583206415176,
      "step": 183,
      "step_time": 26.904572651000308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.0933451559394598,
      "epoch": 0.00368,
      "grad_norm": 0.004598891828209162,
      "kl": 0.12106670817593113,
      "learning_rate": 7.999967549775057e-06,
      "loss": -0.0001,
      "step": 184,
      "step_time": 11.607436572001461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.15387224033474922,
      "epoch": 0.0037,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.010245956480503082,
      "kl": 0.5030446688178927,
      "learning_rate": 7.999967109777834e-06,
      "loss": -0.0,
      "num_tokens": 9707382.0,
      "reward": 2.4315314292907715,
      "reward_std": 0.47317853569984436,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 0.9855233430862427,
      "rewards/probe_shaping_dominance/std": 0.0818924754858017,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.4883667528629303,
      "rewards/rollout_reward_func/std": 0.20319455862045288,
      "sampling/importance_sampling_ratio/max": 1.2542879581451416,
      "sampling/importance_sampling_ratio/mean": 0.9586943984031677,
      "sampling/importance_sampling_ratio/min": 0.3715563118457794,
      "sampling/sampling_logp_difference/max": 0.9900554418563843,
      "sampling/sampling_logp_difference/mean": 0.04447564482688904,
      "step": 185,
      "step_time": 27.28605421100019
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.056250001303851604,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.06875000149011612,
      "entropy": 0.15163114294409752,
      "epoch": 0.00372,
      "grad_norm": 0.0044283876195549965,
      "kl": 0.7128359689377248,
      "learning_rate": 7.999966666817687e-06,
      "loss": -0.0,
      "step": 186,
      "step_time": 12.221499876998678
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.1375539805740118,
      "epoch": 0.00374,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.006928480230271816,
      "kl": 0.14416655764216557,
      "learning_rate": 7.999966220894617e-06,
      "loss": -0.0,
      "num_tokens": 9814422.0,
      "reward": 2.40926456451416,
      "reward_std": 0.47349250316619873,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9848357439041138,
      "rewards/probe_shaping_dominance/std": 0.08578190207481384,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.3849461078643799,
      "rewards/rollout_reward_func/std": 0.28888392448425293,
      "sampling/importance_sampling_ratio/max": 1.243560791015625,
      "sampling/importance_sampling_ratio/mean": 0.9681116342544556,
      "sampling/importance_sampling_ratio/min": 0.665830671787262,
      "sampling/sampling_logp_difference/max": 0.37914347648620605,
      "sampling/sampling_logp_difference/mean": 0.03084658458828926,
      "step": 187,
      "step_time": 28.931869071998335
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.13996880408376455,
      "epoch": 0.00376,
      "grad_norm": 0.009369016624987125,
      "kl": 0.15229893615469337,
      "learning_rate": 7.999965772008627e-06,
      "loss": -0.0,
      "step": 188,
      "step_time": 11.766830096999001
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.035416667349636555,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.10358174092834815,
      "epoch": 0.00378,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.03559152036905289,
      "kl": 0.39252137734001735,
      "learning_rate": 7.999965320159715e-06,
      "loss": 0.0,
      "num_tokens": 9914246.0,
      "reward": 2.483328342437744,
      "reward_std": 0.3890749216079712,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.950668215751648,
      "rewards/probe_shaping_dominance/std": 0.1378525346517563,
      "rewards/probe_terminal_raw/mean": 0.056783534586429596,
      "rewards/probe_terminal_raw/std": 0.15526829659938812,
      "rewards/rollout_reward_func/mean": -0.44287341833114624,
      "rewards/rollout_reward_func/std": 0.26299041509628296,
      "sampling/importance_sampling_ratio/max": 1.2944039106369019,
      "sampling/importance_sampling_ratio/mean": 0.9779493808746338,
      "sampling/importance_sampling_ratio/min": 0.5075531005859375,
      "sampling/sampling_logp_difference/max": 0.6781981587409973,
      "sampling/sampling_logp_difference/mean": 0.026978708803653717,
      "step": 189,
      "step_time": 27.036868832000437
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "entropy": 0.10481282410910353,
      "epoch": 0.0038,
      "grad_norm": 0.0055263713002204895,
      "kl": 0.39047255569312256,
      "learning_rate": 7.999964865347883e-06,
      "loss": 0.0001,
      "step": 190,
      "step_time": 11.940458628999295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.1198381851427257,
      "epoch": 0.00382,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0030459309928119183,
      "kl": 0.34787876208429225,
      "learning_rate": 7.999964407573131e-06,
      "loss": 0.0,
      "num_tokens": 10017338.0,
      "reward": 2.2820868492126465,
      "reward_std": 0.4749685525894165,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.966022253036499,
      "rewards/probe_shaping_dominance/std": 0.13379566371440887,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.47768545150756836,
      "rewards/rollout_reward_func/std": 0.2837761640548706,
      "sampling/importance_sampling_ratio/max": 1.7857273817062378,
      "sampling/importance_sampling_ratio/mean": 1.0156748294830322,
      "sampling/importance_sampling_ratio/min": 0.514444887638092,
      "sampling/sampling_logp_difference/max": 0.6646687984466553,
      "sampling/sampling_logp_difference/mean": 0.03443087264895439,
      "step": 191,
      "step_time": 27.4936487089999
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.12180041195824742,
      "epoch": 0.00384,
      "grad_norm": 0.009600832127034664,
      "kl": 0.3496675969581702,
      "learning_rate": 7.999963946835458e-06,
      "loss": 0.0,
      "step": 192,
      "step_time": 11.71437842100022
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07277914439328015,
      "epoch": 0.00386,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005412722937762737,
      "kl": 0.6871760921980012,
      "learning_rate": 7.999963483134866e-06,
      "loss": 0.0001,
      "num_tokens": 10123551.0,
      "reward": 2.4312024116516113,
      "reward_std": 0.31741824746131897,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.954460859298706,
      "rewards/probe_shaping_dominance/std": 0.1439775824546814,
      "rewards/probe_terminal_raw/mean": 0.046875,
      "rewards/probe_terminal_raw/std": 0.1480722874403,
      "rewards/rollout_reward_func/mean": -0.48888325691223145,
      "rewards/rollout_reward_func/std": 0.2712078392505646,
      "sampling/importance_sampling_ratio/max": 1.8100159168243408,
      "sampling/importance_sampling_ratio/mean": 1.0015695095062256,
      "sampling/importance_sampling_ratio/min": 0.4417291283607483,
      "sampling/sampling_logp_difference/max": 0.817058801651001,
      "sampling/sampling_logp_difference/mean": 0.03453746810555458,
      "step": 193,
      "step_time": 26.960456193001846
    },
    {
      "clip_ratio/high_max": 0.06250000186264515,
      "clip_ratio/high_mean": 0.031250000931322575,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0416666679084301,
      "entropy": 0.07823239883873612,
      "epoch": 0.00388,
      "grad_norm": 0.01935429498553276,
      "kl": 0.6307496229807157,
      "learning_rate": 7.999963016471355e-06,
      "loss": 0.0001,
      "step": 194,
      "step_time": 12.808481609999944
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02500000037252903,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09379608882591128,
      "epoch": 0.0039,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0030668650288134813,
      "kl": 0.2814688477665186,
      "learning_rate": 7.999962546844924e-06,
      "loss": 0.0001,
      "num_tokens": 10225590.0,
      "reward": 2.361347198486328,
      "reward_std": 0.32310429215431213,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9668688178062439,
      "rewards/probe_shaping_dominance/std": 0.13289061188697815,
      "rewards/probe_terminal_raw/mean": 0.028328251093626022,
      "rewards/probe_terminal_raw/std": 0.11210102587938309,
      "rewards/rollout_reward_func/mean": -0.49010002613067627,
      "rewards/rollout_reward_func/std": 0.24613085389137268,
      "sampling/importance_sampling_ratio/max": 1.3004266023635864,
      "sampling/importance_sampling_ratio/mean": 0.9684375524520874,
      "sampling/importance_sampling_ratio/min": 0.5094537734985352,
      "sampling/sampling_logp_difference/max": 0.6744171380996704,
      "sampling/sampling_logp_difference/mean": 0.028151309117674828,
      "step": 195,
      "step_time": 25.599357043000964
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.09113423456437886,
      "epoch": 0.00392,
      "grad_norm": 0.003771732561290264,
      "kl": 0.27635849734906515,
      "learning_rate": 7.999962074255578e-06,
      "loss": 0.0001,
      "step": 196,
      "step_time": 11.204666337999697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04920864764972066,
      "epoch": 0.00394,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0027844668366014957,
      "kl": 0.3839081407932099,
      "learning_rate": 7.999961598703312e-06,
      "loss": -0.0,
      "num_tokens": 10330063.0,
      "reward": 2.415410041809082,
      "reward_std": 0.4154632091522217,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9844269156455994,
      "rewards/probe_shaping_dominance/std": 0.08809469640254974,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.44089192152023315,
      "rewards/rollout_reward_func/std": 0.2551630139350891,
      "sampling/importance_sampling_ratio/max": 1.1653680801391602,
      "sampling/importance_sampling_ratio/mean": 0.9744973182678223,
      "sampling/importance_sampling_ratio/min": 0.20111165940761566,
      "sampling/sampling_logp_difference/max": 1.6039009094238281,
      "sampling/sampling_logp_difference/mean": 0.030936850234866142,
      "step": 197,
      "step_time": 26.98998093100181
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.0472417699656944,
      "epoch": 0.00396,
      "grad_norm": 0.0009492259123362601,
      "kl": 0.3996036083844956,
      "learning_rate": 7.99996112018813e-06,
      "loss": -0.0,
      "step": 198,
      "step_time": 12.02342930299983
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06072818394750357,
      "epoch": 0.00398,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0025375511031597853,
      "kl": 0.2914491758947406,
      "learning_rate": 7.999960638710032e-06,
      "loss": 0.0,
      "num_tokens": 10431419.0,
      "reward": 2.499394178390503,
      "reward_std": 0.29632288217544556,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9628201127052307,
      "rewards/probe_shaping_dominance/std": 0.12069481611251831,
      "rewards/probe_terminal_raw/mean": 0.04026930779218674,
      "rewards/probe_terminal_raw/std": 0.13092826306819916,
      "rewards/rollout_reward_func/mean": -0.42244523763656616,
      "rewards/rollout_reward_func/std": 0.24739933013916016,
      "sampling/importance_sampling_ratio/max": 1.3507100343704224,
      "sampling/importance_sampling_ratio/mean": 1.0147151947021484,
      "sampling/importance_sampling_ratio/min": 0.9091832637786865,
      "sampling/sampling_logp_difference/max": 0.338870108127594,
      "sampling/sampling_logp_difference/mean": 0.010294873267412186,
      "step": 199,
      "step_time": 27.086743224999736
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.058446566108614206,
      "epoch": 0.004,
      "grad_norm": 0.0024834321811795235,
      "kl": 0.2936624846115592,
      "learning_rate": 7.999960154269017e-06,
      "loss": 0.0,
      "step": 200,
      "step_time": 11.463394613998389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09987628925591707,
      "epoch": 0.00402,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0032795185688883066,
      "kl": 0.33637799334246665,
      "learning_rate": 7.999959666865086e-06,
      "loss": -0.0,
      "num_tokens": 10533498.0,
      "reward": 2.4651217460632324,
      "reward_std": 0.32078394293785095,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9926146268844604,
      "rewards/probe_shaping_dominance/std": 0.04177792742848396,
      "rewards/probe_terminal_raw/mean": 0.008003048598766327,
      "rewards/probe_terminal_raw/std": 0.04527207836508751,
      "rewards/rollout_reward_func/mean": -0.4229958653450012,
      "rewards/rollout_reward_func/std": 0.19672146439552307,
      "sampling/importance_sampling_ratio/max": 1.195106863975525,
      "sampling/importance_sampling_ratio/mean": 0.9418940544128418,
      "sampling/importance_sampling_ratio/min": 0.318993479013443,
      "sampling/sampling_logp_difference/max": 0.9258831739425659,
      "sampling/sampling_logp_difference/mean": 0.038004204630851746,
      "step": 201,
      "step_time": 26.624555751001026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.10252866102382541,
      "epoch": 0.00404,
      "grad_norm": 0.0035051219165325165,
      "kl": 0.3395325805176981,
      "learning_rate": 7.99995917649824e-06,
      "loss": -0.0,
      "step": 202,
      "step_time": 12.736442242999146
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.10787439718842506,
      "epoch": 0.00406,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.00344108697026968,
      "kl": 0.40611333276319783,
      "learning_rate": 7.999958683168479e-06,
      "loss": 0.0,
      "num_tokens": 10637062.0,
      "reward": 2.5038881301879883,
      "reward_std": 0.22744759917259216,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.4148617684841156,
      "rewards/rollout_reward_func/std": 0.18833006918430328,
      "sampling/importance_sampling_ratio/max": 1.1548116207122803,
      "sampling/importance_sampling_ratio/mean": 0.9753589630126953,
      "sampling/importance_sampling_ratio/min": 0.7033773064613342,
      "sampling/sampling_logp_difference/max": 0.35186219215393066,
      "sampling/sampling_logp_difference/mean": 0.019522543996572495,
      "step": 203,
      "step_time": 26.715982574999543
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "entropy": 0.1049330742098391,
      "epoch": 0.00408,
      "grad_norm": 0.0019796311389654875,
      "kl": 0.4593061124905944,
      "learning_rate": 7.999958186875805e-06,
      "loss": -0.0,
      "step": 204,
      "step_time": 11.646448757999678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.055373367242282256,
      "epoch": 0.0041,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.006926523055881262,
      "kl": 0.05150494979155518,
      "learning_rate": 7.999957687620215e-06,
      "loss": -0.0,
      "num_tokens": 10738428.0,
      "reward": 2.550138473510742,
      "reward_std": 0.22538912296295166,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.0,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9910282492637634,
      "rewards/probe_shaping_dominance/std": 0.05075191706418991,
      "rewards/probe_terminal_raw/mean": 0.00940040685236454,
      "rewards/probe_terminal_raw/std": 0.05317673459649086,
      "rewards/rollout_reward_func/mean": -0.4002901315689087,
      "rewards/rollout_reward_func/std": 0.22546610236167908,
      "sampling/importance_sampling_ratio/max": 1.2517437934875488,
      "sampling/importance_sampling_ratio/mean": 0.9799097180366516,
      "sampling/importance_sampling_ratio/min": 0.5997620224952698,
      "sampling/sampling_logp_difference/max": 0.5112212896347046,
      "sampling/sampling_logp_difference/mean": 0.01782449334859848,
      "step": 205,
      "step_time": 26.197072295999533
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.06316551179043017,
      "epoch": 0.00412,
      "grad_norm": 0.0017257543513551354,
      "kl": 0.053950335964449536,
      "learning_rate": 7.999957185401714e-06,
      "loss": -0.0,
      "step": 206,
      "step_time": 12.549622151999756
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.11197321023792028,
      "epoch": 0.00414,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.031097499653697014,
      "kl": 3.89400917571038,
      "learning_rate": 7.9999566802203e-06,
      "loss": 0.0001,
      "num_tokens": 10840689.0,
      "reward": 2.345735549926758,
      "reward_std": 0.5137441754341125,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 0.9725180268287659,
      "rewards/probe_shaping_dominance/std": 0.10821773111820221,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.5142826437950134,
      "rewards/rollout_reward_func/std": 0.19152681529521942,
      "sampling/importance_sampling_ratio/max": 1.9924818277359009,
      "sampling/importance_sampling_ratio/mean": 0.9943416118621826,
      "sampling/importance_sampling_ratio/min": 0.39203470945358276,
      "sampling/sampling_logp_difference/max": 0.9361467361450195,
      "sampling/sampling_logp_difference/mean": 0.053312450647354126,
      "step": 207,
      "step_time": 26.643122880999726
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.035416667349636555,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.058333334513008595,
      "entropy": 0.11182145914062858,
      "epoch": 0.00416,
      "grad_norm": 0.007240073289722204,
      "kl": 1.6443076208233833,
      "learning_rate": 7.999956172075974e-06,
      "loss": 0.0,
      "step": 208,
      "step_time": 11.64378536300228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.12927352613769472,
      "epoch": 0.00418,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004229975864291191,
      "kl": 0.6016647743063004,
      "learning_rate": 7.999955660968735e-06,
      "loss": -0.0,
      "num_tokens": 10944113.0,
      "reward": 2.364624261856079,
      "reward_std": 0.36824679374694824,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.49162572622299194,
      "rewards/rollout_reward_func/std": 0.21871674060821533,
      "sampling/importance_sampling_ratio/max": 1.3223011493682861,
      "sampling/importance_sampling_ratio/mean": 0.9632259607315063,
      "sampling/importance_sampling_ratio/min": 0.3602616786956787,
      "sampling/sampling_logp_difference/max": 0.6850378513336182,
      "sampling/sampling_logp_difference/mean": 0.04301746189594269,
      "step": 209,
      "step_time": 26.264724693000062
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.12698473082855344,
      "epoch": 0.0042,
      "grad_norm": 0.004611098673194647,
      "kl": 0.6409582832593514,
      "learning_rate": 7.999955146898586e-06,
      "loss": -0.0001,
      "step": 210,
      "step_time": 12.728916892999223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04340869339648634,
      "epoch": 0.00422,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0008402821840718389,
      "kl": 0.035792879805057964,
      "learning_rate": 7.999954629865525e-06,
      "loss": -0.0,
      "num_tokens": 11047946.0,
      "reward": 2.3281283378601074,
      "reward_std": 0.43589621782302856,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9857661724090576,
      "rewards/probe_shaping_dominance/std": 0.08051877468824387,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.4982629418373108,
      "rewards/rollout_reward_func/std": 0.20851053297519684,
      "sampling/importance_sampling_ratio/max": 1.0012203454971313,
      "sampling/importance_sampling_ratio/mean": 0.9677799940109253,
      "sampling/importance_sampling_ratio/min": 0.4670157730579376,
      "sampling/sampling_logp_difference/max": 0.7613925933837891,
      "sampling/sampling_logp_difference/mean": 0.014532409608364105,
      "step": 211,
      "step_time": 26.491312149000805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.04452452051918954,
      "epoch": 0.00424,
      "grad_norm": 0.0009245733381249011,
      "kl": 0.039327465879523515,
      "learning_rate": 7.999954109869554e-06,
      "loss": -0.0,
      "step": 212,
      "step_time": 11.690953868999713
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.10158436209894717,
      "epoch": 0.00426,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0037761996500194073,
      "kl": 0.43266808055341244,
      "learning_rate": 7.999953586910674e-06,
      "loss": -0.0,
      "num_tokens": 11155145.0,
      "reward": 2.33209490776062,
      "reward_std": 0.3974522352218628,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9518005847930908,
      "rewards/probe_shaping_dominance/std": 0.15248610079288483,
      "rewards/probe_terminal_raw/mean": 0.046875,
      "rewards/probe_terminal_raw/std": 0.1480722874403,
      "rewards/rollout_reward_func/mean": -0.4603305459022522,
      "rewards/rollout_reward_func/std": 0.2795467674732208,
      "sampling/importance_sampling_ratio/max": 1.5568536520004272,
      "sampling/importance_sampling_ratio/mean": 1.0121254920959473,
      "sampling/importance_sampling_ratio/min": 0.6084503531455994,
      "sampling/sampling_logp_difference/max": 0.49602431058883667,
      "sampling/sampling_logp_difference/mean": 0.017653338611125946,
      "step": 213,
      "step_time": 26.773649626000406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.10326709412038326,
      "epoch": 0.00428,
      "grad_norm": 0.004299989901483059,
      "kl": 0.4246340822428465,
      "learning_rate": 7.999953060988884e-06,
      "loss": 0.0,
      "step": 214,
      "step_time": 12.393828191000466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.10672931908629835,
      "epoch": 0.0043,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0029812948778271675,
      "kl": 0.5036190063692629,
      "learning_rate": 7.999952532104185e-06,
      "loss": 0.0,
      "num_tokens": 11256499.0,
      "reward": 2.3668174743652344,
      "reward_std": 0.4220028221607208,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.984410285949707,
      "rewards/probe_shaping_dominance/std": 0.08818867057561874,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.458217978477478,
      "rewards/rollout_reward_func/std": 0.1993042230606079,
      "sampling/importance_sampling_ratio/max": 1.2048288583755493,
      "sampling/importance_sampling_ratio/mean": 0.9700103998184204,
      "sampling/importance_sampling_ratio/min": 0.2804865837097168,
      "sampling/sampling_logp_difference/max": 1.2170777320861816,
      "sampling/sampling_logp_difference/mean": 0.027440235018730164,
      "step": 215,
      "step_time": 26.241349470000387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.10663612652570009,
      "epoch": 0.00432,
      "grad_norm": 0.0025962339714169502,
      "kl": 0.514960631611757,
      "learning_rate": 7.99995200025658e-06,
      "loss": 0.0,
      "step": 216,
      "step_time": 11.455195212000945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.13120519556105137,
      "epoch": 0.00434,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.00685320096090436,
      "kl": 0.5306107758951839,
      "learning_rate": 7.999951465446065e-06,
      "loss": 0.0,
      "num_tokens": 11358760.0,
      "reward": 2.4137301445007324,
      "reward_std": 0.38182157278060913,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9719860553741455,
      "rewards/probe_shaping_dominance/std": 0.1105431467294693,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.4770059883594513,
      "rewards/rollout_reward_func/std": 0.27089011669158936,
      "sampling/importance_sampling_ratio/max": 1.8946123123168945,
      "sampling/importance_sampling_ratio/mean": 1.0106232166290283,
      "sampling/importance_sampling_ratio/min": 0.6873172521591187,
      "sampling/sampling_logp_difference/max": 0.6602880954742432,
      "sampling/sampling_logp_difference/mean": 0.026765936985611916,
      "step": 217,
      "step_time": 28.19653884499894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.1328302058391273,
      "epoch": 0.00436,
      "grad_norm": 0.006467514205724001,
      "kl": 0.5236879177391529,
      "learning_rate": 7.999950927672645e-06,
      "loss": 0.0,
      "step": 218,
      "step_time": 11.548230411000986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0558876832947135,
      "epoch": 0.00438,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.002873801626265049,
      "kl": 0.43705418131622764,
      "learning_rate": 7.999950386936317e-06,
      "loss": 0.0001,
      "num_tokens": 11459134.0,
      "reward": 2.4926953315734863,
      "reward_std": 0.2576614320278168,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9532788395881653,
      "rewards/probe_shaping_dominance/std": 0.1259964108467102,
      "rewards/probe_terminal_raw/mean": 0.049288615584373474,
      "rewards/probe_terminal_raw/std": 0.13439743220806122,
      "rewards/rollout_reward_func/mean": -0.4286222755908966,
      "rewards/rollout_reward_func/std": 0.13808076083660126,
      "sampling/importance_sampling_ratio/max": 2.167020320892334,
      "sampling/importance_sampling_ratio/mean": 1.0488494634628296,
      "sampling/importance_sampling_ratio/min": 0.5981054306030273,
      "sampling/sampling_logp_difference/max": 0.773352861404419,
      "sampling/sampling_logp_difference/mean": 0.021742573007941246,
      "step": 219,
      "step_time": 26.59079552000003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.05215576570481062,
      "epoch": 0.0044,
      "grad_norm": 0.013386573642492294,
      "kl": 0.4328960892962641,
      "learning_rate": 7.999949843237083e-06,
      "loss": 0.0001,
      "step": 220,
      "step_time": 11.575578054999824
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.10125815495848656,
      "epoch": 0.00442,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.003916088026016951,
      "kl": 0.22720737754934817,
      "learning_rate": 7.999949296574944e-06,
      "loss": 0.0,
      "num_tokens": 11564110.0,
      "reward": 2.5024495124816895,
      "reward_std": 0.21472422778606415,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 2.0,
      "rewards/probe_completion_length/std": 0.0,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9687739014625549,
      "rewards/probe_shaping_dominance/std": 0.12289554625749588,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.4475744962692261,
      "rewards/rollout_reward_func/std": 0.21471136808395386,
      "sampling/importance_sampling_ratio/max": 1.2565096616744995,
      "sampling/importance_sampling_ratio/mean": 0.9851142168045044,
      "sampling/importance_sampling_ratio/min": 0.7785980701446533,
      "sampling/sampling_logp_difference/max": 0.25026071071624756,
      "sampling/sampling_logp_difference/mean": 0.014336168766021729,
      "step": 221,
      "step_time": 28.308517722000943
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.10386610007844865,
      "epoch": 0.00444,
      "grad_norm": 0.0038715400733053684,
      "kl": 0.2309217918664217,
      "learning_rate": 7.9999487469499e-06,
      "loss": 0.0,
      "step": 222,
      "step_time": 11.59164219199829
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08839935716241598,
      "epoch": 0.00446,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0031392991077154875,
      "kl": 0.3969584498627228,
      "learning_rate": 7.999948194361951e-06,
      "loss": 0.0,
      "num_tokens": 11670791.0,
      "reward": 2.504007339477539,
      "reward_std": 0.40813401341438293,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.4459925591945648,
      "rewards/rollout_reward_func/std": 0.22923637926578522,
      "sampling/importance_sampling_ratio/max": 1.2424126863479614,
      "sampling/importance_sampling_ratio/mean": 1.0054875612258911,
      "sampling/importance_sampling_ratio/min": 0.8022926449775696,
      "sampling/sampling_logp_difference/max": 0.2571254372596741,
      "sampling/sampling_logp_difference/mean": 0.01522812806069851,
      "step": 223,
      "step_time": 27.01184939599989
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.08901654137298465,
      "epoch": 0.00448,
      "grad_norm": 0.0026675413828343153,
      "kl": 0.3970091380215308,
      "learning_rate": 7.999947638811098e-06,
      "loss": 0.0,
      "step": 224,
      "step_time": 12.880684480999662
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06860345043241978,
      "epoch": 0.0045,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005898882634937763,
      "kl": 0.2994147054851055,
      "learning_rate": 7.999947080297344e-06,
      "loss": 0.0001,
      "num_tokens": 11778059.0,
      "reward": 2.442521095275879,
      "reward_std": 0.44092267751693726,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.3535533845424652,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5387288331985474,
      "rewards/rollout_reward_func/std": 0.17624567449092865,
      "sampling/importance_sampling_ratio/max": 1.9132263660430908,
      "sampling/importance_sampling_ratio/mean": 1.0267926454544067,
      "sampling/importance_sampling_ratio/min": 0.2760489583015442,
      "sampling/sampling_logp_difference/max": 1.2855275869369507,
      "sampling/sampling_logp_difference/mean": 0.03292452543973923,
      "step": 225,
      "step_time": 26.894577987999583
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "entropy": 0.0715375836007297,
      "epoch": 0.00452,
      "grad_norm": 0.004127421882003546,
      "kl": 0.2991956745972857,
      "learning_rate": 7.999946518820686e-06,
      "loss": 0.0001,
      "step": 226,
      "step_time": 11.7451522450001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07084862189367414,
      "epoch": 0.00454,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.007534612435847521,
      "kl": 0.3083134523330955,
      "learning_rate": 7.999945954381125e-06,
      "loss": -0.0,
      "num_tokens": 11885416.0,
      "reward": 2.2896175384521484,
      "reward_std": 0.4199885129928589,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9857305884361267,
      "rewards/probe_shaping_dominance/std": 0.080719955265522,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.5367380380630493,
      "rewards/rollout_reward_func/std": 0.2644577920436859,
      "sampling/importance_sampling_ratio/max": 1.2167645692825317,
      "sampling/importance_sampling_ratio/mean": 0.9729256629943848,
      "sampling/importance_sampling_ratio/min": 0.5702285766601562,
      "sampling/sampling_logp_difference/max": 0.556563138961792,
      "sampling/sampling_logp_difference/mean": 0.01854308322072029,
      "step": 227,
      "step_time": 26.478597906999312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.07101618498563766,
      "epoch": 0.00456,
      "grad_norm": 0.005244475323706865,
      "kl": 0.275350460462505,
      "learning_rate": 7.999945386978663e-06,
      "loss": -0.0,
      "step": 228,
      "step_time": 12.815234450999014
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.10753743472741917,
      "epoch": 0.00458,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.002793548395857215,
      "kl": 0.3363812413687519,
      "learning_rate": 7.999944816613299e-06,
      "loss": 0.0,
      "num_tokens": 11990346.0,
      "reward": 2.4647884368896484,
      "reward_std": 0.3218696117401123,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9882341623306274,
      "rewards/probe_shaping_dominance/std": 0.06655776500701904,
      "rewards/probe_terminal_raw/mean": 0.011559959501028061,
      "rewards/probe_terminal_raw/std": 0.06539300829172134,
      "rewards/rollout_reward_func/mean": -0.45375561714172363,
      "rewards/rollout_reward_func/std": 0.26721474528312683,
      "sampling/importance_sampling_ratio/max": 1.7522544860839844,
      "sampling/importance_sampling_ratio/mean": 1.0056817531585693,
      "sampling/importance_sampling_ratio/min": 0.39151322841644287,
      "sampling/sampling_logp_difference/max": 0.9377517700195312,
      "sampling/sampling_logp_difference/mean": 0.030310627073049545,
      "step": 229,
      "step_time": 26.652824122999846
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.10178712871856987,
      "epoch": 0.0046,
      "grad_norm": 0.0023058054503053427,
      "kl": 0.3472972925131521,
      "learning_rate": 7.999944243285035e-06,
      "loss": 0.0,
      "step": 230,
      "step_time": 11.641791465999631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.012500000186264515,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.11396907176822424,
      "epoch": 0.00462,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0053448486141860485,
      "kl": 0.23751085135154426,
      "learning_rate": 7.999943666993872e-06,
      "loss": -0.0,
      "num_tokens": 12094123.0,
      "reward": 2.3231983184814453,
      "reward_std": 0.4537913501262665,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9484962224960327,
      "rewards/probe_shaping_dominance/std": 0.14161793887615204,
      "rewards/probe_terminal_raw/mean": 0.05538617819547653,
      "rewards/probe_terminal_raw/std": 0.15303537249565125,
      "rewards/rollout_reward_func/mean": -0.4744342267513275,
      "rewards/rollout_reward_func/std": 0.27888038754463196,
      "sampling/importance_sampling_ratio/max": 1.2306643724441528,
      "sampling/importance_sampling_ratio/mean": 0.9789013862609863,
      "sampling/importance_sampling_ratio/min": 0.5588669180870056,
      "sampling/sampling_logp_difference/max": 0.5087692737579346,
      "sampling/sampling_logp_difference/mean": 0.027260489761829376,
      "step": 231,
      "step_time": 27.108853302998796
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.11494649667292833,
      "epoch": 0.00464,
      "grad_norm": 0.0034225336275994778,
      "kl": 0.2446515706833452,
      "learning_rate": 7.999943087739808e-06,
      "loss": -0.0,
      "step": 232,
      "step_time": 12.437156906999007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09780422016046941,
      "epoch": 0.00466,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.00331493909470737,
      "kl": 0.29221273493021727,
      "learning_rate": 7.999942505522845e-06,
      "loss": 0.0,
      "num_tokens": 12202392.0,
      "reward": 2.31793212890625,
      "reward_std": 0.4711916446685791,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9865642786026001,
      "rewards/probe_shaping_dominance/std": 0.07600414007902145,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.44675713777542114,
      "rewards/rollout_reward_func/std": 0.27934086322784424,
      "sampling/importance_sampling_ratio/max": 1.2045822143554688,
      "sampling/importance_sampling_ratio/mean": 0.9702666997909546,
      "sampling/importance_sampling_ratio/min": 0.5390675067901611,
      "sampling/sampling_logp_difference/max": 0.6179147958755493,
      "sampling/sampling_logp_difference/mean": 0.02464653179049492,
      "step": 233,
      "step_time": 27.07101158400019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.10010742908343673,
      "epoch": 0.00468,
      "grad_norm": 0.00394394900649786,
      "kl": 0.28515962581150234,
      "learning_rate": 7.999941920342986e-06,
      "loss": 0.0,
      "step": 234,
      "step_time": 11.908877233997373
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09357268398161978,
      "epoch": 0.0047,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.003001472679898143,
      "kl": 0.4120303535989933,
      "learning_rate": 7.999941332200228e-06,
      "loss": 0.0,
      "num_tokens": 12307473.0,
      "reward": 2.356600761413574,
      "reward_std": 0.39092886447906494,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9900838732719421,
      "rewards/probe_shaping_dominance/std": 0.05609414726495743,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.47410792112350464,
      "rewards/rollout_reward_func/std": 0.2651825547218323,
      "sampling/importance_sampling_ratio/max": 1.2125083208084106,
      "sampling/importance_sampling_ratio/mean": 0.9483182430267334,
      "sampling/importance_sampling_ratio/min": 0.5642846822738647,
      "sampling/sampling_logp_difference/max": 0.5796399116516113,
      "sampling/sampling_logp_difference/mean": 0.029487669467926025,
      "step": 235,
      "step_time": 27.473293748998913
    },
    {
      "clip_ratio/high_max": 0.06666666828095913,
      "clip_ratio/high_mean": 0.033333334140479565,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.08885149616980925,
      "epoch": 0.00472,
      "grad_norm": 0.004106747917830944,
      "kl": 0.39987785345859805,
      "learning_rate": 7.999940741094573e-06,
      "loss": 0.0,
      "step": 236,
      "step_time": 11.607714889999443
    },
    {
      "clip_ratio/high_max": 0.0416666679084301,
      "clip_ratio/high_mean": 0.02083333395421505,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0416666679084301,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0790116679854691,
      "epoch": 0.00474,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0017510356847196817,
      "kl": 0.49183082331728656,
      "learning_rate": 7.999940147026021e-06,
      "loss": 0.0,
      "num_tokens": 12410261.0,
      "reward": 2.362030029296875,
      "reward_std": 0.48628348112106323,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9550326466560364,
      "rewards/probe_shaping_dominance/std": 0.1423776000738144,
      "rewards/probe_terminal_raw/mean": 0.046875,
      "rewards/probe_terminal_raw/std": 0.1480722874403,
      "rewards/rollout_reward_func/mean": -0.464877724647522,
      "rewards/rollout_reward_func/std": 0.2927810847759247,
      "sampling/importance_sampling_ratio/max": 1.2767555713653564,
      "sampling/importance_sampling_ratio/mean": 1.0007102489471436,
      "sampling/importance_sampling_ratio/min": 0.5674677491188049,
      "sampling/sampling_logp_difference/max": 0.564541220664978,
      "sampling/sampling_logp_difference/mean": 0.017719101160764694,
      "step": 237,
      "step_time": 26.277223889999732
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.031250000931322575,
      "entropy": 0.08091688924469054,
      "epoch": 0.00476,
      "grad_norm": 0.0037676175124943256,
      "kl": 0.4987390860915184,
      "learning_rate": 7.999939549994574e-06,
      "loss": 0.0,
      "step": 238,
      "step_time": 11.42589379400033
    },
    {
      "clip_ratio/high_max": 0.03125,
      "clip_ratio/high_mean": 0.015625,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.015625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08572797977831215,
      "epoch": 0.00478,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0028349068015813828,
      "kl": 0.29074460588162765,
      "learning_rate": 7.99993895000023e-06,
      "loss": -0.0001,
      "num_tokens": 12515046.0,
      "reward": 2.3852663040161133,
      "reward_std": 0.48509836196899414,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9448926448822021,
      "rewards/probe_shaping_dominance/std": 0.15274296700954437,
      "rewards/probe_terminal_raw/mean": 0.05525914579629898,
      "rewards/probe_terminal_raw/std": 0.15285103023052216,
      "rewards/rollout_reward_func/mean": -0.43988555669784546,
      "rewards/rollout_reward_func/std": 0.28072717785835266,
      "sampling/importance_sampling_ratio/max": 1.2809064388275146,
      "sampling/importance_sampling_ratio/mean": 0.9681559801101685,
      "sampling/importance_sampling_ratio/min": 0.417494535446167,
      "sampling/sampling_logp_difference/max": 0.8734843134880066,
      "sampling/sampling_logp_difference/mean": 0.02679057978093624,
      "step": 239,
      "step_time": 27.850705083998037
    },
    {
      "clip_ratio/high_max": 0.05208333395421505,
      "clip_ratio/high_mean": 0.026041666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.026041666977107525,
      "entropy": 0.09145444841124117,
      "epoch": 0.0048,
      "grad_norm": 0.003533316310495138,
      "kl": 0.276357589289546,
      "learning_rate": 7.999938347042993e-06,
      "loss": -0.0001,
      "step": 240,
      "step_time": 11.650785684000766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05321495997486636,
      "epoch": 0.00482,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.002123113488778472,
      "kl": 0.1996255109550784,
      "learning_rate": 7.999937741122862e-06,
      "loss": 0.0,
      "num_tokens": 12618608.0,
      "reward": 2.31355619430542,
      "reward_std": 0.3297788202762604,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5426939129829407,
      "rewards/rollout_reward_func/std": 0.22457517683506012,
      "sampling/importance_sampling_ratio/max": 1.1050293445587158,
      "sampling/importance_sampling_ratio/mean": 1.0058460235595703,
      "sampling/importance_sampling_ratio/min": 0.9022819995880127,
      "sampling/sampling_logp_difference/max": 0.10648787021636963,
      "sampling/sampling_logp_difference/mean": 0.005735831335186958,
      "step": 241,
      "step_time": 26.73763404300007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.05486724083311856,
      "epoch": 0.00484,
      "grad_norm": 0.003093272214755416,
      "kl": 0.1941228064047955,
      "learning_rate": 7.999937132239836e-06,
      "loss": 0.0,
      "step": 242,
      "step_time": 11.670754389999274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07577884336933494,
      "epoch": 0.00486,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0036162908654659986,
      "kl": 0.4399729967590247,
      "learning_rate": 7.999936520393918e-06,
      "loss": 0.0,
      "num_tokens": 12726447.0,
      "reward": 2.3645379543304443,
      "reward_std": 0.41120022535324097,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9753913879394531,
      "rewards/probe_shaping_dominance/std": 0.09771986305713654,
      "rewards/probe_terminal_raw/mean": 0.0260416679084301,
      "rewards/probe_terminal_raw/std": 0.1046360433101654,
      "rewards/rollout_reward_func/mean": -0.4618951678276062,
      "rewards/rollout_reward_func/std": 0.1977241188287735,
      "sampling/importance_sampling_ratio/max": 1.1149406433105469,
      "sampling/importance_sampling_ratio/mean": 0.9780128002166748,
      "sampling/importance_sampling_ratio/min": 0.7354345321655273,
      "sampling/sampling_logp_difference/max": 0.18633489310741425,
      "sampling/sampling_logp_difference/mean": 0.013524588197469711,
      "step": 243,
      "step_time": 27.977090622001015
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.07133703003637493,
      "epoch": 0.00488,
      "grad_norm": 0.002898427424952388,
      "kl": 0.44227540418796707,
      "learning_rate": 7.999935905585108e-06,
      "loss": 0.0,
      "step": 244,
      "step_time": 11.75723793999805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0315001527142158,
      "epoch": 0.0049,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.001392417005263269,
      "kl": 0.23886053822934628,
      "learning_rate": 7.999935287813407e-06,
      "loss": -0.0,
      "num_tokens": 12827575.0,
      "reward": 2.4073498249053955,
      "reward_std": 0.42101356387138367,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9554626941680908,
      "rewards/probe_shaping_dominance/std": 0.14310474693775177,
      "rewards/probe_terminal_raw/mean": 0.046875,
      "rewards/probe_terminal_raw/std": 0.1480722874403,
      "rewards/rollout_reward_func/mean": -0.4199880063533783,
      "rewards/rollout_reward_func/std": 0.2148957997560501,
      "sampling/importance_sampling_ratio/max": 1.0394365787506104,
      "sampling/importance_sampling_ratio/mean": 0.995591402053833,
      "sampling/importance_sampling_ratio/min": 0.8603565096855164,
      "sampling/sampling_logp_difference/max": 0.1303640604019165,
      "sampling/sampling_logp_difference/mean": 0.004159946460276842,
      "step": 245,
      "step_time": 26.077412141000423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.032327667491699685,
      "epoch": 0.00492,
      "grad_norm": 0.0010727684712037444,
      "kl": 0.23855953469561797,
      "learning_rate": 7.999934667078813e-06,
      "loss": -0.0,
      "step": 246,
      "step_time": 11.513740063000114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0816163292620331,
      "epoch": 0.00494,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0027896249666810036,
      "kl": 0.4679242782876827,
      "learning_rate": 7.999934043381328e-06,
      "loss": 0.0,
      "num_tokens": 12935730.0,
      "reward": 2.46283221244812,
      "reward_std": 0.36876291036605835,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9373108148574829,
      "rewards/probe_shaping_dominance/std": 0.1728522628545761,
      "rewards/probe_terminal_raw/mean": 0.0625,
      "rewards/probe_terminal_raw/std": 0.16800537705421448,
      "rewards/rollout_reward_func/mean": -0.3932287096977234,
      "rewards/rollout_reward_func/std": 0.24200834333896637,
      "sampling/importance_sampling_ratio/max": 1.2427064180374146,
      "sampling/importance_sampling_ratio/mean": 1.0063412189483643,
      "sampling/importance_sampling_ratio/min": 0.8085158467292786,
      "sampling/sampling_logp_difference/max": 0.21965795755386353,
      "sampling/sampling_logp_difference/mean": 0.01280665211379528,
      "step": 247,
      "step_time": 28.041720137000084
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.08206989825703204,
      "epoch": 0.00496,
      "grad_norm": 0.00293480372056365,
      "kl": 0.46830739825963974,
      "learning_rate": 7.999933416720957e-06,
      "loss": 0.0,
      "step": 248,
      "step_time": 11.713867525000751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06533269377541728,
      "epoch": 0.00498,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.003347411984577775,
      "kl": 0.36843465792230745,
      "learning_rate": 7.999932787097692e-06,
      "loss": 0.0001,
      "num_tokens": 13041381.0,
      "reward": 2.382171630859375,
      "reward_std": 0.4231238067150116,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9549021124839783,
      "rewards/probe_shaping_dominance/std": 0.14646287262439728,
      "rewards/probe_terminal_raw/mean": 0.04255589470267296,
      "rewards/probe_terminal_raw/std": 0.13594815135002136,
      "rewards/rollout_reward_func/mean": -0.50278639793396,
      "rewards/rollout_reward_func/std": 0.27676716446876526,
      "sampling/importance_sampling_ratio/max": 1.3422638177871704,
      "sampling/importance_sampling_ratio/mean": 0.9941832423210144,
      "sampling/importance_sampling_ratio/min": 0.6115661263465881,
      "sampling/sampling_logp_difference/max": 0.4917324185371399,
      "sampling/sampling_logp_difference/mean": 0.018511097878217697,
      "step": 249,
      "step_time": 26.64282015000026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.07043309864820912,
      "epoch": 0.005,
      "grad_norm": 0.0035562312696129084,
      "kl": 0.359963540629451,
      "learning_rate": 7.999932154511542e-06,
      "loss": 0.0,
      "step": 250,
      "step_time": 11.727345789000537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08174855704419315,
      "epoch": 0.00502,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.003543607424944639,
      "kl": 0.5413316028789268,
      "learning_rate": 7.999931518962502e-06,
      "loss": 0.0,
      "num_tokens": 13146021.0,
      "reward": 2.4559497833251953,
      "reward_std": 0.3885264992713928,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9621438384056091,
      "rewards/probe_shaping_dominance/std": 0.1238301619887352,
      "rewards/probe_terminal_raw/mean": 0.03963414579629898,
      "rewards/probe_terminal_raw/std": 0.12972840666770935,
      "rewards/rollout_reward_func/mean": -0.40207818150520325,
      "rewards/rollout_reward_func/std": 0.2555524408817291,
      "sampling/importance_sampling_ratio/max": 1.1064826250076294,
      "sampling/importance_sampling_ratio/mean": 0.954660177230835,
      "sampling/importance_sampling_ratio/min": 0.41962218284606934,
      "sampling/sampling_logp_difference/max": 0.7979011535644531,
      "sampling/sampling_logp_difference/mean": 0.023729108273983,
      "step": 251,
      "step_time": 27.992850227999952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.07990033202804625,
      "epoch": 0.00504,
      "grad_norm": 0.003231135895475745,
      "kl": 0.524783481414488,
      "learning_rate": 7.999930880450575e-06,
      "loss": 0.0,
      "step": 252,
      "step_time": 11.643585757999972
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.07902092937729321,
      "epoch": 0.00506,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.006585233379155397,
      "kl": 0.37969694038247326,
      "learning_rate": 7.99993023897576e-06,
      "loss": 0.0,
      "num_tokens": 13246298.0,
      "reward": 2.4005722999572754,
      "reward_std": 0.3679780662059784,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.974242091178894,
      "rewards/probe_shaping_dominance/std": 0.10219167917966843,
      "rewards/probe_terminal_raw/mean": 0.026549797505140305,
      "rewards/probe_terminal_raw/std": 0.10620416700839996,
      "rewards/rollout_reward_func/mean": -0.42521971464157104,
      "rewards/rollout_reward_func/std": 0.21645236015319824,
      "sampling/importance_sampling_ratio/max": 1.969668984413147,
      "sampling/importance_sampling_ratio/mean": 1.0500105619430542,
      "sampling/importance_sampling_ratio/min": 0.7689392566680908,
      "sampling/sampling_logp_difference/max": 0.6780328750610352,
      "sampling/sampling_logp_difference/mean": 0.02139047347009182,
      "step": 253,
      "step_time": 26.232431414999155
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.07854951097397134,
      "epoch": 0.00508,
      "grad_norm": 0.005968212615698576,
      "kl": 0.3778405386647137,
      "learning_rate": 7.99992959453806e-06,
      "loss": 0.0,
      "step": 254,
      "step_time": 12.017544923999594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04191483659815276,
      "epoch": 0.0051,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004378916695713997,
      "kl": 0.3174490866222186,
      "learning_rate": 7.999928947137475e-06,
      "loss": -0.0,
      "num_tokens": 13351235.0,
      "reward": 2.3821582794189453,
      "reward_std": 0.4624309539794922,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9869383573532104,
      "rewards/probe_shaping_dominance/std": 0.07388784736394882,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.41415512561798096,
      "rewards/rollout_reward_func/std": 0.23873184621334076,
      "sampling/importance_sampling_ratio/max": 1.257253885269165,
      "sampling/importance_sampling_ratio/mean": 1.011238932609558,
      "sampling/importance_sampling_ratio/min": 0.9685202836990356,
      "sampling/sampling_logp_difference/max": 0.2289290428161621,
      "sampling/sampling_logp_difference/mean": 0.005532183218747377,
      "step": 255,
      "step_time": 28.14665811800114
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.044428632616472896,
      "epoch": 0.00512,
      "grad_norm": 0.001523565617389977,
      "kl": 0.3174588828405831,
      "learning_rate": 7.999928296774006e-06,
      "loss": -0.0,
      "step": 256,
      "step_time": 11.396023698001045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.11287707928568125,
      "epoch": 0.00514,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0049528395757079124,
      "kl": 0.3751811153779272,
      "learning_rate": 7.999927643447652e-06,
      "loss": -0.0001,
      "num_tokens": 13453732.0,
      "reward": 2.2990427017211914,
      "reward_std": 0.4729869067668915,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.8994619250297546,
      "rewards/probe_shaping_dominance/std": 0.23433801531791687,
      "rewards/probe_terminal_raw/mean": 0.08892276883125305,
      "rewards/probe_terminal_raw/std": 0.1897670477628708,
      "rewards/rollout_reward_func/mean": -0.451841801404953,
      "rewards/rollout_reward_func/std": 0.3020572066307068,
      "sampling/importance_sampling_ratio/max": 1.7735323905944824,
      "sampling/importance_sampling_ratio/mean": 1.0311025381088257,
      "sampling/importance_sampling_ratio/min": 0.48170769214630127,
      "sampling/sampling_logp_difference/max": 0.5872056484222412,
      "sampling/sampling_logp_difference/mean": 0.03187928348779678,
      "step": 257,
      "step_time": 27.428863920001277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.11026706825941801,
      "epoch": 0.00516,
      "grad_norm": 0.0036789914593100548,
      "kl": 0.37549637774645817,
      "learning_rate": 7.999926987158413e-06,
      "loss": -0.0001,
      "step": 258,
      "step_time": 12.307902244997422
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09494227101095021,
      "epoch": 0.00518,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.004995207767933607,
      "kl": 0.5894506504137098,
      "learning_rate": 7.999926327906292e-06,
      "loss": 0.0,
      "num_tokens": 13559320.0,
      "reward": 2.3814258575439453,
      "reward_std": 0.36968865990638733,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9921875,
      "rewards/probe_shaping_dominance/std": 0.04419417306780815,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.4826367497444153,
      "rewards/rollout_reward_func/std": 0.231715127825737,
      "sampling/importance_sampling_ratio/max": 1.2988759279251099,
      "sampling/importance_sampling_ratio/mean": 0.989588737487793,
      "sampling/importance_sampling_ratio/min": 0.3728586435317993,
      "sampling/sampling_logp_difference/max": 0.9864900708198547,
      "sampling/sampling_logp_difference/mean": 0.030208630487322807,
      "step": 259,
      "step_time": 28.526762178002173
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.09542630659416318,
      "epoch": 0.0052,
      "grad_norm": 0.009572784416377544,
      "kl": 0.5865388629335939,
      "learning_rate": 7.999925665691289e-06,
      "loss": 0.0,
      "step": 260,
      "step_time": 11.52395996999985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.042740301505546086,
      "epoch": 0.00522,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0034757580142468214,
      "kl": 0.16234587341508444,
      "learning_rate": 7.999925000513405e-06,
      "loss": 0.0001,
      "num_tokens": 13662277.0,
      "reward": 2.3550405502319336,
      "reward_std": 0.3789060413837433,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9763131737709045,
      "rewards/probe_shaping_dominance/std": 0.09356633573770523,
      "rewards/probe_terminal_raw/mean": 0.023119919002056122,
      "rewards/probe_terminal_raw/std": 0.0910695344209671,
      "rewards/rollout_reward_func/mean": -0.4693926274776459,
      "rewards/rollout_reward_func/std": 0.27393800020217896,
      "sampling/importance_sampling_ratio/max": 1.9132373332977295,
      "sampling/importance_sampling_ratio/mean": 1.0334219932556152,
      "sampling/importance_sampling_ratio/min": 0.8748363256454468,
      "sampling/sampling_logp_difference/max": 0.648794412612915,
      "sampling/sampling_logp_difference/mean": 0.015361637808382511,
      "step": 261,
      "step_time": 27.68740953500128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.043997991022479255,
      "epoch": 0.00524,
      "grad_norm": 0.0034889201633632183,
      "kl": 0.1585660980490502,
      "learning_rate": 7.999924332372639e-06,
      "loss": 0.0,
      "step": 262,
      "step_time": 12.369422526000562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.06730000481184106,
      "epoch": 0.00526,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0019632691983133554,
      "kl": 0.2906430190632818,
      "learning_rate": 7.999923661268994e-06,
      "loss": -0.0,
      "num_tokens": 13768535.0,
      "reward": 2.461604356765747,
      "reward_std": 0.28569555282592773,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9790951013565063,
      "rewards/probe_shaping_dominance/std": 0.08612176775932312,
      "rewards/probe_terminal_raw/mean": 0.023373983800411224,
      "rewards/probe_terminal_raw/std": 0.09738598018884659,
      "rewards/rollout_reward_func/mean": -0.42836469411849976,
      "rewards/rollout_reward_func/std": 0.21179892122745514,
      "sampling/importance_sampling_ratio/max": 1.027362585067749,
      "sampling/importance_sampling_ratio/mean": 0.911888837814331,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 1.2283318042755127,
      "sampling/sampling_logp_difference/mean": 0.04068940505385399,
      "step": 263,
      "step_time": 28.125288621000436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.0668873688664462,
      "epoch": 0.00528,
      "grad_norm": 0.0020422539673745632,
      "kl": 0.30596065653662663,
      "learning_rate": 7.999922987202466e-06,
      "loss": -0.0,
      "step": 264,
      "step_time": 11.507015873000455
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05058241146616638,
      "epoch": 0.0053,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0018712878227233887,
      "kl": 0.39055716490838677,
      "learning_rate": 7.999922310173063e-06,
      "loss": -0.0,
      "num_tokens": 13871840.0,
      "reward": 2.4825406074523926,
      "reward_std": 0.31064870953559875,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9752524495124817,
      "rewards/probe_shaping_dominance/std": 0.09777678549289703,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.41146183013916016,
      "rewards/rollout_reward_func/std": 0.21425116062164307,
      "sampling/importance_sampling_ratio/max": 1.5599281787872314,
      "sampling/importance_sampling_ratio/mean": 1.0341243743896484,
      "sampling/importance_sampling_ratio/min": 0.8953186869621277,
      "sampling/sampling_logp_difference/max": 0.4449194669723511,
      "sampling/sampling_logp_difference/mean": 0.013410702347755432,
      "step": 265,
      "step_time": 27.96838706700055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.04936367901973426,
      "epoch": 0.00532,
      "grad_norm": 0.006141372956335545,
      "kl": 0.3867563092110231,
      "learning_rate": 7.99992163018078e-06,
      "loss": -0.0,
      "step": 266,
      "step_time": 12.308435358998395
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05740413888270268,
      "epoch": 0.00534,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0028442663606256247,
      "kl": 0.3010439347126521,
      "learning_rate": 7.99992094722562e-06,
      "loss": -0.0,
      "num_tokens": 13974703.0,
      "reward": 2.375330924987793,
      "reward_std": 0.3971181809902191,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9330692291259766,
      "rewards/probe_shaping_dominance/std": 0.15932095050811768,
      "rewards/probe_terminal_raw/mean": 0.06885162740945816,
      "rewards/probe_terminal_raw/std": 0.1653386801481247,
      "rewards/rollout_reward_func/mean": -0.42034000158309937,
      "rewards/rollout_reward_func/std": 0.19739177823066711,
      "sampling/importance_sampling_ratio/max": 1.2114074230194092,
      "sampling/importance_sampling_ratio/mean": 0.9802918434143066,
      "sampling/importance_sampling_ratio/min": 0.3451912999153137,
      "sampling/sampling_logp_difference/max": 1.0613338947296143,
      "sampling/sampling_logp_difference/mean": 0.018370507284998894,
      "step": 267,
      "step_time": 27.86067632400045
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.055183965210744645,
      "epoch": 0.00536,
      "grad_norm": 0.0022630670573562384,
      "kl": 0.344313826324651,
      "learning_rate": 7.999920261307583e-06,
      "loss": -0.0,
      "step": 268,
      "step_time": 11.746586444000059
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.08427908451994881,
      "epoch": 0.00538,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0037011466920375824,
      "kl": 0.412635525688529,
      "learning_rate": 7.999919572426668e-06,
      "loss": -0.0,
      "num_tokens": 14078089.0,
      "reward": 2.4167308807373047,
      "reward_std": 0.32326242327690125,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9451819658279419,
      "rewards/probe_shaping_dominance/std": 0.147642120718956,
      "rewards/probe_terminal_raw/mean": 0.05843495950102806,
      "rewards/probe_terminal_raw/std": 0.15837596356868744,
      "rewards/rollout_reward_func/mean": -0.44313597679138184,
      "rewards/rollout_reward_func/std": 0.24654169380664825,
      "sampling/importance_sampling_ratio/max": 1.858984112739563,
      "sampling/importance_sampling_ratio/mean": 0.9879124164581299,
      "sampling/importance_sampling_ratio/min": 0.6056866645812988,
      "sampling/sampling_logp_difference/max": 0.6200296878814697,
      "sampling/sampling_logp_difference/mean": 0.027817152440547943,
      "step": 269,
      "step_time": 26.451382616000046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.07716414582682773,
      "epoch": 0.0054,
      "grad_norm": 0.0030677285976707935,
      "kl": 0.4153696422581561,
      "learning_rate": 7.999918880582879e-06,
      "loss": -0.0,
      "step": 270,
      "step_time": 12.785874016998605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04053633386229194,
      "epoch": 0.00542,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.001796143944375217,
      "kl": 0.5015344847925007,
      "learning_rate": 7.999918185776215e-06,
      "loss": 0.0,
      "num_tokens": 14181503.0,
      "reward": 2.4646096229553223,
      "reward_std": 0.2045918107032776,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9679263234138489,
      "rewards/probe_shaping_dominance/std": 0.1049569845199585,
      "rewards/probe_terminal_raw/mean": 0.0364583358168602,
      "rewards/probe_terminal_raw/std": 0.11773227155208588,
      "rewards/rollout_reward_func/mean": -0.4585248529911041,
      "rewards/rollout_reward_func/std": 0.16162419319152832,
      "sampling/importance_sampling_ratio/max": 1.4571605920791626,
      "sampling/importance_sampling_ratio/mean": 1.0197436809539795,
      "sampling/importance_sampling_ratio/min": 0.8846800923347473,
      "sampling/sampling_logp_difference/max": 0.3764890432357788,
      "sampling/sampling_logp_difference/mean": 0.012306122109293938,
      "step": 271,
      "step_time": 26.693239825001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.038741875116102165,
      "epoch": 0.00544,
      "grad_norm": 0.0020677302964031696,
      "kl": 0.5029990994371474,
      "learning_rate": 7.999917488006676e-06,
      "loss": 0.0,
      "step": 272,
      "step_time": 11.444299719997616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04818115712259896,
      "epoch": 0.00546,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.008343451656401157,
      "kl": 0.7089566249400381,
      "learning_rate": 7.999916787274264e-06,
      "loss": 0.0001,
      "num_tokens": 14287480.0,
      "reward": 2.4599452018737793,
      "reward_std": 0.38899266719818115,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9238950610160828,
      "rewards/probe_shaping_dominance/std": 0.16443566977977753,
      "rewards/probe_terminal_raw/mean": 0.08130080997943878,
      "rewards/probe_terminal_raw/std": 0.17714287340641022,
      "rewards/rollout_reward_func/mean": -0.3702506721019745,
      "rewards/rollout_reward_func/std": 0.21257071197032928,
      "sampling/importance_sampling_ratio/max": 2.423100471496582,
      "sampling/importance_sampling_ratio/mean": 1.0725514888763428,
      "sampling/importance_sampling_ratio/min": 0.8080363273620605,
      "sampling/sampling_logp_difference/max": 0.8850466012954712,
      "sampling/sampling_logp_difference/mean": 0.024975256994366646,
      "step": 273,
      "step_time": 28.09797250900101
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.04680645616099355,
      "epoch": 0.00548,
      "grad_norm": 0.003927062265574932,
      "kl": 0.742738697305322,
      "learning_rate": 7.99991608357898e-06,
      "loss": 0.0001,
      "step": 274,
      "step_time": 11.650237371000003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04380835813935846,
      "epoch": 0.0055,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0025579470675438643,
      "kl": 0.21995878049926887,
      "learning_rate": 7.999915376920822e-06,
      "loss": -0.0,
      "num_tokens": 14387389.0,
      "reward": 2.2633914947509766,
      "reward_std": 0.42217421531677246,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9556913375854492,
      "rewards/probe_shaping_dominance/std": 0.1223374605178833,
      "rewards/probe_terminal_raw/mean": 0.0518292672932148,
      "rewards/probe_terminal_raw/std": 0.14265993237495422,
      "rewards/rollout_reward_func/mean": -0.5378788709640503,
      "rewards/rollout_reward_func/std": 0.23384462296962738,
      "sampling/importance_sampling_ratio/max": 1.084592580795288,
      "sampling/importance_sampling_ratio/mean": 0.9922658205032349,
      "sampling/importance_sampling_ratio/min": 0.7613502740859985,
      "sampling/sampling_logp_difference/max": 0.2726619839668274,
      "sampling/sampling_logp_difference/mean": 0.009103155694901943,
      "step": 275,
      "step_time": 26.459266137000668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.03994250175310299,
      "epoch": 0.00552,
      "grad_norm": 0.0021381748374551535,
      "kl": 0.2157795349397702,
      "learning_rate": 7.999914667299794e-06,
      "loss": -0.0,
      "step": 276,
      "step_time": 11.672075437000785
    },
    {
      "clip_ratio/high_max": 0.05000000074505806,
      "clip_ratio/high_mean": 0.02500000037252903,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.037500000558793545,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.09868528880178928,
      "epoch": 0.00554,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0031571455765515566,
      "kl": 0.4792258571833372,
      "learning_rate": 7.999913954715895e-06,
      "loss": 0.0,
      "num_tokens": 14492025.0,
      "reward": 2.2542710304260254,
      "reward_std": 0.38688531517982483,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.5082289576530457,
      "rewards/rollout_reward_func/std": 0.17395071685314178,
      "sampling/importance_sampling_ratio/max": 1.9612770080566406,
      "sampling/importance_sampling_ratio/mean": 1.0468454360961914,
      "sampling/importance_sampling_ratio/min": 0.5976178646087646,
      "sampling/sampling_logp_difference/max": 0.7003155946731567,
      "sampling/sampling_logp_difference/mean": 0.032625701278448105,
      "step": 277,
      "step_time": 27.236174976000257
    },
    {
      "clip_ratio/high_max": 0.05000000074505806,
      "clip_ratio/high_mean": 0.02500000037252903,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.037500000558793545,
      "entropy": 0.09623363520950079,
      "epoch": 0.00556,
      "grad_norm": 0.0032991948537528515,
      "kl": 0.4749853519606404,
      "learning_rate": 7.999913239169126e-06,
      "loss": 0.0,
      "step": 278,
      "step_time": 12.07052038799975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.8125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04121039004530758,
      "epoch": 0.00558,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0032093473710119724,
      "kl": 0.6897661700841127,
      "learning_rate": 7.999912520659488e-06,
      "loss": 0.0,
      "num_tokens": 14593223.0,
      "reward": 2.3469300270080566,
      "reward_std": 0.5208548307418823,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.8125,
      "rewards/probe_completion_length/std": 0.3965577781200409,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.95980304479599,
      "rewards/probe_shaping_dominance/std": 0.12795211374759674,
      "rewards/probe_terminal_raw/mean": 0.042174797505140305,
      "rewards/probe_terminal_raw/std": 0.13503843545913696,
      "rewards/rollout_reward_func/mean": -0.44879791140556335,
      "rewards/rollout_reward_func/std": 0.2045743763446808,
      "sampling/importance_sampling_ratio/max": 1.9838464260101318,
      "sampling/importance_sampling_ratio/mean": 1.0156028270721436,
      "sampling/importance_sampling_ratio/min": 0.1315358281135559,
      "sampling/sampling_logp_difference/max": 2.028473377227783,
      "sampling/sampling_logp_difference/mean": 0.03758270666003227,
      "step": 279,
      "step_time": 26.44161211500159
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.033333334140479565,
      "entropy": 0.047012478462420404,
      "epoch": 0.0056,
      "grad_norm": 0.0013261314015835524,
      "kl": 0.7127395562856691,
      "learning_rate": 7.99991179918698e-06,
      "loss": -0.0,
      "step": 280,
      "step_time": 11.634762280001269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.012499196142016444,
      "epoch": 0.00562,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0003787693567574024,
      "kl": 0.669078703969717,
      "learning_rate": 7.999911074751606e-06,
      "loss": -0.0,
      "num_tokens": 14693012.0,
      "reward": 2.4939217567443848,
      "reward_std": 0.381552517414093,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9186484813690186,
      "rewards/probe_shaping_dominance/std": 0.195449098944664,
      "rewards/probe_terminal_raw/mean": 0.078125,
      "rewards/probe_terminal_raw/std": 0.18445101380348206,
      "rewards/rollout_reward_func/mean": -0.3903515338897705,
      "rewards/rollout_reward_func/std": 0.2618943452835083,
      "sampling/importance_sampling_ratio/max": 1.0298659801483154,
      "sampling/importance_sampling_ratio/mean": 0.9976564645767212,
      "sampling/importance_sampling_ratio/min": 0.9420029520988464,
      "sampling/sampling_logp_difference/max": 0.05974767729640007,
      "sampling/sampling_logp_difference/mean": 0.0016555668553337455,
      "step": 281,
      "step_time": 26.723270941998635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.012372259192488855,
      "epoch": 0.00564,
      "grad_norm": 0.0003435203689150512,
      "kl": 0.6690934834768996,
      "learning_rate": 7.999910347353363e-06,
      "loss": -0.0,
      "step": 282,
      "step_time": 11.794659334002063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.84375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.045614961185492575,
      "epoch": 0.00566,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.003150342497974634,
      "kl": 0.48013901670856285,
      "learning_rate": 7.999909616992255e-06,
      "loss": -0.0,
      "num_tokens": 14799672.0,
      "reward": 2.3399429321289062,
      "reward_std": 0.4422038793563843,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.84375,
      "rewards/probe_completion_length/std": 0.3689020276069641,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 1.0,
      "rewards/probe_shaping_dominance/std": 0.0,
      "rewards/probe_terminal_raw/mean": 0.0,
      "rewards/probe_terminal_raw/std": 0.0,
      "rewards/rollout_reward_func/mean": -0.45380693674087524,
      "rewards/rollout_reward_func/std": 0.1835639625787735,
      "sampling/importance_sampling_ratio/max": 1.2092225551605225,
      "sampling/importance_sampling_ratio/mean": 0.9782531261444092,
      "sampling/importance_sampling_ratio/min": 0.3157159686088562,
      "sampling/sampling_logp_difference/max": 1.1528494358062744,
      "sampling/sampling_logp_difference/mean": 0.019979460164904594,
      "step": 283,
      "step_time": 27.036351400000058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.03985181718599051,
      "epoch": 0.00568,
      "grad_norm": 0.0033008423633873463,
      "kl": 0.49970418894372415,
      "learning_rate": 7.99990888366828e-06,
      "loss": -0.0,
      "step": 284,
      "step_time": 11.668078124000203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.03972258236899506,
      "epoch": 0.0057,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.002630846342071891,
      "kl": 0.3517824411392212,
      "learning_rate": 7.99990814738144e-06,
      "loss": -0.0,
      "num_tokens": 14902831.0,
      "reward": 2.4359757900238037,
      "reward_std": 0.2911105751991272,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9871374368667603,
      "rewards/probe_shaping_dominance/std": 0.07276186347007751,
      "rewards/probe_terminal_raw/mean": 0.015625,
      "rewards/probe_terminal_raw/std": 0.0883883461356163,
      "rewards/rollout_reward_func/mean": -0.48553669452667236,
      "rewards/rollout_reward_func/std": 0.2099909633398056,
      "sampling/importance_sampling_ratio/max": 1.558259129524231,
      "sampling/importance_sampling_ratio/mean": 1.021366834640503,
      "sampling/importance_sampling_ratio/min": 0.757884681224823,
      "sampling/sampling_logp_difference/max": 0.4435689449310303,
      "sampling/sampling_logp_difference/mean": 0.011840267106890678,
      "step": 285,
      "step_time": 27.424052481000217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.040461032156599686,
      "epoch": 0.00572,
      "grad_norm": 0.002737229922786355,
      "kl": 0.3537818659096956,
      "learning_rate": 7.999907408131737e-06,
      "loss": -0.0,
      "step": 286,
      "step_time": 12.126654321001297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.96875,
      "completions/mean_terminated_length": 2.96875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.020485240605921717,
      "epoch": 0.00574,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.000876868492923677,
      "kl": 0.23688423214722576,
      "learning_rate": 7.999906665919169e-06,
      "loss": -0.0,
      "num_tokens": 15005261.0,
      "reward": 2.5098652839660645,
      "reward_std": 0.30707597732543945,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.96875,
      "rewards/probe_completion_length/std": 0.1767766922712326,
      "rewards/probe_invalid_count/mean": 0.03125,
      "rewards/probe_invalid_count/std": 0.1767766922712326,
      "rewards/probe_shaping_dominance/mean": 0.9733736515045166,
      "rewards/probe_shaping_dominance/std": 0.10744811594486237,
      "rewards/probe_terminal_raw/mean": 0.0260416679084301,
      "rewards/probe_terminal_raw/std": 0.1046360433101654,
      "rewards/rollout_reward_func/mean": -0.4395501911640167,
      "rewards/rollout_reward_func/std": 0.18828870356082916,
      "sampling/importance_sampling_ratio/max": 1.0840176343917847,
      "sampling/importance_sampling_ratio/mean": 1.0012118816375732,
      "sampling/importance_sampling_ratio/min": 0.9655031561851501,
      "sampling/sampling_logp_difference/max": 0.08256775140762329,
      "sampling/sampling_logp_difference/mean": 0.00235398905351758,
      "step": 287,
      "step_time": 27.075085327001034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.022996263058303157,
      "epoch": 0.00576,
      "grad_norm": 0.0009354232461191714,
      "kl": 0.23660576696175895,
      "learning_rate": 7.99990592074374e-06,
      "loss": -0.0,
      "step": 288,
      "step_time": 11.657212093999078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.05316271091851377,
      "epoch": 0.00578,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.006305683869868517,
      "kl": 0.2035164695232652,
      "learning_rate": 7.999905172605446e-06,
      "loss": -0.0001,
      "num_tokens": 15107252.0,
      "reward": 2.422664165496826,
      "reward_std": 0.37807923555374146,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9658713340759277,
      "rewards/probe_shaping_dominance/std": 0.10942408442497253,
      "rewards/probe_terminal_raw/mean": 0.03315548598766327,
      "rewards/probe_terminal_raw/std": 0.1095743477344513,
      "rewards/rollout_reward_func/mean": -0.40136268734931946,
      "rewards/rollout_reward_func/std": 0.2093636691570282,
      "sampling/importance_sampling_ratio/max": 1.5805177688598633,
      "sampling/importance_sampling_ratio/mean": 1.0220205783843994,
      "sampling/importance_sampling_ratio/min": 0.7326148748397827,
      "sampling/sampling_logp_difference/max": 0.4577510356903076,
      "sampling/sampling_logp_difference/mean": 0.019495096057653427,
      "step": 289,
      "step_time": 26.987616914999307
    },
    {
      "clip_ratio/high_max": 0.04583333432674408,
      "clip_ratio/high_mean": 0.02291666716337204,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02291666716337204,
      "entropy": 0.05469944020660478,
      "epoch": 0.0058,
      "grad_norm": 0.0032733359839767218,
      "kl": 0.18666235760611016,
      "learning_rate": 7.999904421504293e-06,
      "loss": -0.0001,
      "step": 290,
      "step_time": 11.951281235001261
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.02291666716337204,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.035416667349636555,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.0591709428122158,
      "epoch": 0.00582,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0026493356563150883,
      "kl": 0.8575776647776365,
      "learning_rate": 7.999903667440278e-06,
      "loss": 0.0,
      "num_tokens": 15208793.0,
      "reward": 2.402831792831421,
      "reward_std": 0.3910689353942871,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9471915364265442,
      "rewards/probe_shaping_dominance/std": 0.1431892067193985,
      "rewards/probe_terminal_raw/mean": 0.0520833358168602,
      "rewards/probe_terminal_raw/std": 0.1433027982711792,
      "rewards/rollout_reward_func/mean": -0.42144304513931274,
      "rewards/rollout_reward_func/std": 0.21596133708953857,
      "sampling/importance_sampling_ratio/max": 1.0310035943984985,
      "sampling/importance_sampling_ratio/mean": 0.9701290130615234,
      "sampling/importance_sampling_ratio/min": 0.5706773400306702,
      "sampling/sampling_logp_difference/max": 0.5609317421913147,
      "sampling/sampling_logp_difference/mean": 0.014490557834506035,
      "step": 291,
      "step_time": 27.075510762000704
    },
    {
      "clip_ratio/high_max": 0.02500000037252903,
      "clip_ratio/high_mean": 0.012500000186264515,
      "clip_ratio/low_mean": 0.012500000186264515,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02500000037252903,
      "entropy": 0.05919087287338698,
      "epoch": 0.00584,
      "grad_norm": 0.0026768911629915237,
      "kl": 0.8454538804168692,
      "learning_rate": 7.999902910413404e-06,
      "loss": 0.0,
      "step": 292,
      "step_time": 12.032383580999522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04045550918681329,
      "epoch": 0.00586,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.00724328076466918,
      "kl": 0.80053227301687,
      "learning_rate": 7.999902150423671e-06,
      "loss": -0.0001,
      "num_tokens": 15311233.0,
      "reward": 2.4362893104553223,
      "reward_std": 0.426661878824234,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9734768271446228,
      "rewards/probe_shaping_dominance/std": 0.10647083818912506,
      "rewards/probe_terminal_raw/mean": 0.026295732706785202,
      "rewards/probe_terminal_raw/std": 0.10541322082281113,
      "rewards/rollout_reward_func/mean": -0.38848331570625305,
      "rewards/rollout_reward_func/std": 0.2122591733932495,
      "sampling/importance_sampling_ratio/max": 1.8292688131332397,
      "sampling/importance_sampling_ratio/mean": 1.001596212387085,
      "sampling/importance_sampling_ratio/min": 0.44141146540641785,
      "sampling/sampling_logp_difference/max": 0.8177778720855713,
      "sampling/sampling_logp_difference/mean": 0.025196455419063568,
      "step": 293,
      "step_time": 27.23413759199957
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "entropy": 0.04230553897491518,
      "epoch": 0.00588,
      "grad_norm": 0.005148016382008791,
      "kl": 0.6622665030881763,
      "learning_rate": 7.999901387471079e-06,
      "loss": -0.0001,
      "step": 294,
      "step_time": 11.526401772997815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.03366142028335162,
      "epoch": 0.0059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.005694986321032047,
      "kl": 0.39196249035501296,
      "learning_rate": 7.99990062155563e-06,
      "loss": 0.0,
      "num_tokens": 15421347.0,
      "reward": 2.391371726989746,
      "reward_std": 0.43072906136512756,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.875,
      "rewards/probe_completion_length/std": 0.33601075410842896,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9607213735580444,
      "rewards/probe_shaping_dominance/std": 0.12690994143486023,
      "rewards/probe_terminal_raw/mean": 0.046875,
      "rewards/probe_terminal_raw/std": 0.1480722874403,
      "rewards/rollout_reward_func/mean": -0.4412246346473694,
      "rewards/rollout_reward_func/std": 0.21457210183143616,
      "sampling/importance_sampling_ratio/max": 1.2205545902252197,
      "sampling/importance_sampling_ratio/mean": 0.9986574053764343,
      "sampling/importance_sampling_ratio/min": 0.7592641115188599,
      "sampling/sampling_logp_difference/max": 0.2809281349182129,
      "sampling/sampling_logp_difference/mean": 0.008172519505023956,
      "step": 295,
      "step_time": 26.643844086999707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.030476719188300194,
      "epoch": 0.00592,
      "grad_norm": 0.005326179787516594,
      "kl": 0.39566947892306814,
      "learning_rate": 7.999899852677322e-06,
      "loss": 0.0,
      "step": 296,
      "step_time": 12.454534126997714
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.90625,
      "completions/mean_terminated_length": 2.90625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.04483710537169827,
      "epoch": 0.00594,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0021372437477111816,
      "kl": 0.4166623194081088,
      "learning_rate": 7.99989908083616e-06,
      "loss": 0.0,
      "num_tokens": 15523076.0,
      "reward": 2.4664759635925293,
      "reward_std": 0.4568862318992615,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.90625,
      "rewards/probe_completion_length/std": 0.2961445748806,
      "rewards/probe_invalid_count/mean": 0.0625,
      "rewards/probe_invalid_count/std": 0.24593468010425568,
      "rewards/probe_shaping_dominance/mean": 0.9591568112373352,
      "rewards/probe_shaping_dominance/std": 0.11802849918603897,
      "rewards/probe_terminal_raw/mean": 0.04509654641151428,
      "rewards/probe_terminal_raw/std": 0.1317683309316635,
      "rewards/rollout_reward_func/mean": -0.4565274119377136,
      "rewards/rollout_reward_func/std": 0.26263633370399475,
      "sampling/importance_sampling_ratio/max": 1.3225888013839722,
      "sampling/importance_sampling_ratio/mean": 1.0174564123153687,
      "sampling/importance_sampling_ratio/min": 0.8623110055923462,
      "sampling/sampling_logp_difference/max": 0.27959030866622925,
      "sampling/sampling_logp_difference/mean": 0.008479975163936615,
      "step": 297,
      "step_time": 26.703501694998522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 0.04334457405639114,
      "epoch": 0.00596,
      "grad_norm": 0.004324762150645256,
      "kl": 0.41364979138597846,
      "learning_rate": 7.999898306032144e-06,
      "loss": 0.0,
      "step": 298,
      "step_time": 11.624797897999088
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 2.9375,
      "completions/mean_terminated_length": 2.9375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 0.039461553949308836,
      "epoch": 0.00598,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0021312020253390074,
      "kl": 0.4295559982638224,
      "learning_rate": 7.999897528265272e-06,
      "loss": 0.0,
      "num_tokens": 15625505.0,
      "reward": 2.4885663986206055,
      "reward_std": 0.32209959626197815,
      "rewards/format_guard/mean": -0.05000000074505806,
      "rewards/format_guard/std": 0.0,
      "rewards/probe_completion_length/mean": 1.9375,
      "rewards/probe_completion_length/std": 0.24593468010425568,
      "rewards/probe_invalid_count/mean": 0.0,
      "rewards/probe_invalid_count/std": 0.0,
      "rewards/probe_shaping_dominance/mean": 0.9723982810974121,
      "rewards/probe_shaping_dominance/std": 0.10864228010177612,
      "rewards/probe_terminal_raw/mean": 0.03125,
      "rewards/probe_terminal_raw/std": 0.12296734005212784,
      "rewards/rollout_reward_func/mean": -0.40258198976516724,
      "rewards/rollout_reward_func/std": 0.1721213161945343,
      "sampling/importance_sampling_ratio/max": 1.015625,
      "sampling/importance_sampling_ratio/mean": 0.9557619690895081,
      "sampling/importance_sampling_ratio/min": 0.3387709856033325,
      "sampling/sampling_logp_difference/max": 1.0839133262634277,
      "sampling/sampling_logp_difference/mean": 0.021332627162337303,
      "step": 299,
      "step_time": 26.165180010000768
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "entropy": 0.04107913846030442,
      "epoch": 0.006,
      "grad_norm": 0.0022343825548887253,
      "kl": 0.42880946584045887,
      "learning_rate": 7.999896747535546e-06,
      "loss": 0.0,
      "step": 300,
      "step_time": 12.217135184999279
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 100000,
  "num_input_tokens_seen": 15625505,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}