{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.20366598778004075,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1371.0,
      "completions/max_terminated_length": 1371.0,
      "completions/mean_length": 454.40625,
      "completions/mean_terminated_length": 454.40625,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.0004073319755600815,
      "grad_norm": 0.1175210377671042,
      "kl": 9.036064147949219e-05,
      "learning_rate": 0.0,
      "loss": -0.0186,
      "num_tokens": 29437.0,
      "reward": 3.1875,
      "reward_std": 1.1310166120529175,
      "rewards/cargo_build_reward": 0.4375,
      "rewards/cargo_clippy_reward": 0.4375,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.6875,
      "rewards/tests_have_asserts_reward": 0.34375,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 995.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 371.28125,
      "completions/mean_terminated_length": 383.258064516129,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.000814663951120163,
      "grad_norm": 0.132212480221256,
      "kl": 8.082389831542969e-05,
      "learning_rate": 1e-07,
      "loss": -0.0175,
      "num_tokens": 60654.0,
      "reward": 3.484375,
      "reward_std": 1.3426176309585571,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.90625,
      "rewards/test_block_count_reward": 0.8125,
      "rewards/tests_have_asserts_reward": 0.53125,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 599.0,
      "completions/max_terminated_length": 599.0,
      "completions/mean_length": 213.21875,
      "completions/mean_terminated_length": 213.21875,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.0012219959266802445,
      "grad_norm": 0.12912013288705376,
      "kl": 0.0001004934310913086,
      "learning_rate": 2e-07,
      "loss": -0.0111,
      "num_tokens": 82117.0,
      "reward": 4.421875,
      "reward_std": 1.8205678462982178,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.9375,
      "rewards/test_block_count_reward": 0.8125,
      "rewards/tests_have_asserts_reward": 0.578125,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 630.0,
      "completions/max_terminated_length": 630.0,
      "completions/mean_length": 303.4375,
      "completions/mean_terminated_length": 303.4375,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.001629327902240326,
      "grad_norm": 0.12375396685149824,
      "kl": 0.00011289119720458984,
      "learning_rate": 3e-07,
      "loss": -0.0151,
      "num_tokens": 106875.0,
      "reward": 4.046875,
      "reward_std": 1.3957082033157349,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.875,
      "rewards/test_block_count_reward": 0.71875,
      "rewards/tests_have_asserts_reward": 0.5,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 608.0,
      "completions/max_terminated_length": 608.0,
      "completions/mean_length": 367.9375,
      "completions/mean_terminated_length": 367.9375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.002036659877800407,
      "grad_norm": 0.18338271433197806,
      "kl": 9.357929229736328e-05,
      "learning_rate": 4e-07,
      "loss": -0.0004,
      "num_tokens": 133297.0,
      "reward": 4.5859375,
      "reward_std": 1.7285411357879639,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 977.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 286.1875,
      "completions/mean_terminated_length": 286.1875,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.002443991853360489,
      "grad_norm": 0.1225919146096492,
      "kl": 0.00015878677368164062,
      "learning_rate": 5e-07,
      "loss": -0.0025,
      "num_tokens": 156767.0,
      "reward": 3.5859375,
      "reward_std": 1.216860055923462,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.875,
      "rewards/test_block_count_reward": 0.75,
      "rewards/tests_have_asserts_reward": 0.3359375,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 549.0,
      "completions/max_terminated_length": 549.0,
      "completions/mean_length": 220.15625,
      "completions/mean_terminated_length": 220.15625,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.0028513238289205704,
      "grad_norm": 0.1586147171757847,
      "kl": 0.00019216537475585938,
      "learning_rate": 6e-07,
      "loss": -0.0151,
      "num_tokens": 178516.0,
      "reward": 3.890625,
      "reward_std": 1.9172019958496094,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.875,
      "rewards/test_block_count_reward": 0.625,
      "rewards/tests_have_asserts_reward": 0.453125,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1825.0,
      "completions/max_terminated_length": 1825.0,
      "completions/mean_length": 473.34375,
      "completions/mean_terminated_length": 473.34375,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "epoch": 0.003258655804480652,
      "grad_norm": 0.0793017248205778,
      "kl": 9.167194366455078e-05,
      "learning_rate": 7e-07,
      "loss": -0.004,
      "num_tokens": 209255.0,
      "reward": 3.96875,
      "reward_std": 0.6764461398124695,
      "rewards/cargo_build_reward": 0.4375,
      "rewards/cargo_clippy_reward": 0.4375,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.875,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 788.0,
      "completions/max_terminated_length": 788.0,
      "completions/mean_length": 317.90625,
      "completions/mean_terminated_length": 317.90625,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.0036659877800407333,
      "grad_norm": 0.1202053154821433,
      "kl": 0.00035762786865234375,
      "learning_rate": 8e-07,
      "loss": -0.0066,
      "num_tokens": 234740.0,
      "reward": 3.4296875,
      "reward_std": 1.3169221878051758,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.9375,
      "rewards/test_block_count_reward": 0.625,
      "rewards/tests_have_asserts_reward": 0.3046875,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1046.0,
      "completions/max_terminated_length": 1046.0,
      "completions/mean_length": 334.8125,
      "completions/mean_terminated_length": 334.8125,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.004073319755600814,
      "grad_norm": 0.12952392522902093,
      "kl": 0.0002543926239013672,
      "learning_rate": 9e-07,
      "loss": -0.017,
      "num_tokens": 260822.0,
      "reward": 3.375,
      "reward_std": 1.4019556045532227,
      "rewards/cargo_build_reward": 0.4375,
      "rewards/cargo_clippy_reward": 0.4375,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.6875,
      "rewards/tests_have_asserts_reward": 0.40625,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1046.0,
      "completions/max_terminated_length": 1046.0,
      "completions/mean_length": 430.3125,
      "completions/mean_terminated_length": 430.3125,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "epoch": 0.004480651731160896,
      "grad_norm": 0.09429395732660069,
      "kl": 0.0001468658447265625,
      "learning_rate": 1e-06,
      "loss": -0.011,
      "num_tokens": 289576.0,
      "reward": 3.1796875,
      "reward_std": 0.8280896544456482,
      "rewards/cargo_build_reward": 0.375,
      "rewards/cargo_clippy_reward": 0.375,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.84375,
      "rewards/tests_have_asserts_reward": 0.5390625,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 586.0,
      "completions/max_terminated_length": 586.0,
      "completions/mean_length": 261.15625,
      "completions/mean_terminated_length": 261.15625,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.004887983706720978,
      "grad_norm": 0.11577175478294834,
      "kl": 0.0003638267517089844,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 312557.0,
      "reward": 4.9453125,
      "reward_std": 1.4855623245239258,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.7109375,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 770.0,
      "completions/max_terminated_length": 770.0,
      "completions/mean_length": 269.65625,
      "completions/mean_terminated_length": 269.65625,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.005295315682281059,
      "grad_norm": 0.14930968488555413,
      "kl": 0.00039386749267578125,
      "learning_rate": 1e-06,
      "loss": -0.0003,
      "num_tokens": 336546.0,
      "reward": 4.2109375,
      "reward_std": 1.5795735120773315,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.75,
      "rewards/tests_have_asserts_reward": 0.3671875,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 817.0,
      "completions/max_terminated_length": 817.0,
      "completions/mean_length": 319.6875,
      "completions/mean_terminated_length": 319.6875,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.005702647657841141,
      "grad_norm": 0.1382750920092372,
      "kl": 0.00045013427734375,
      "learning_rate": 1e-06,
      "loss": -0.0119,
      "num_tokens": 361824.0,
      "reward": 3.90625,
      "reward_std": 1.2457867860794067,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.9375,
      "rewards/test_block_count_reward": 0.84375,
      "rewards/tests_have_asserts_reward": 0.609375,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 641.0,
      "completions/max_terminated_length": 641.0,
      "completions/mean_length": 315.90625,
      "completions/mean_terminated_length": 315.90625,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.006109979633401222,
      "grad_norm": 0.10016667125488203,
      "kl": 0.0008392333984375,
      "learning_rate": 1e-06,
      "loss": -0.0045,
      "num_tokens": 387285.0,
      "reward": 3.7734375,
      "reward_std": 0.9530363082885742,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.75,
      "rewards/tests_have_asserts_reward": 0.3984375,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 993.0,
      "completions/max_terminated_length": 993.0,
      "completions/mean_length": 285.28125,
      "completions/mean_terminated_length": 285.28125,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.006517311608961304,
      "grad_norm": 0.13715469155008012,
      "kl": 0.0009860992431640625,
      "learning_rate": 1e-06,
      "loss": 0.0053,
      "num_tokens": 411862.0,
      "reward": 4.03125,
      "reward_std": 1.5449090003967285,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.84375,
      "rewards/tests_have_asserts_reward": 0.40625,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 607.0,
      "completions/max_terminated_length": 607.0,
      "completions/mean_length": 267.75,
      "completions/mean_terminated_length": 267.75,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.006924643584521385,
      "grad_norm": 0.0866997914412378,
      "kl": 0.0011415481567382812,
      "learning_rate": 1e-06,
      "loss": -0.002,
      "num_tokens": 436118.0,
      "reward": 3.5703125,
      "reward_std": 1.0962092876434326,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.5,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.84375,
      "rewards/test_block_count_reward": 0.8125,
      "rewards/tests_have_asserts_reward": 0.6328125,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 450.0,
      "completions/max_terminated_length": 450.0,
      "completions/mean_length": 220.625,
      "completions/mean_terminated_length": 220.625,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.007331975560081467,
      "grad_norm": 0.10514629633326943,
      "kl": 0.001956939697265625,
      "learning_rate": 1e-06,
      "loss": -0.0009,
      "num_tokens": 458362.0,
      "reward": 4.6796875,
      "reward_std": 1.4177031517028809,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.875,
      "rewards/tests_have_asserts_reward": 0.6484375,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 720.0,
      "completions/max_terminated_length": 720.0,
      "completions/mean_length": 398.3125,
      "completions/mean_terminated_length": 398.3125,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.007739307535641548,
      "grad_norm": 0.11106316659149937,
      "kl": 0.001277923583984375,
      "learning_rate": 1e-06,
      "loss": -0.0067,
      "num_tokens": 485228.0,
      "reward": 3.671875,
      "reward_std": 0.9971106052398682,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.5,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.90625,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 827.0,
      "completions/max_terminated_length": 827.0,
      "completions/mean_length": 363.21875,
      "completions/mean_terminated_length": 363.21875,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "epoch": 0.008146639511201629,
      "grad_norm": 0.09132485848606724,
      "kl": 0.0019779205322265625,
      "learning_rate": 1e-06,
      "loss": -0.0124,
      "num_tokens": 511587.0,
      "reward": 4.0234375,
      "reward_std": 0.9875117540359497,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.5,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.6484375,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 651.0,
      "completions/max_terminated_length": 651.0,
      "completions/mean_length": 350.71875,
      "completions/mean_terminated_length": 362.03225806451616,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.008553971486761711,
      "grad_norm": 0.10923375708807619,
      "kl": 0.002239227294921875,
      "learning_rate": 1e-06,
      "loss": -0.0095,
      "num_tokens": 541231.0,
      "reward": 3.4765625,
      "reward_std": 0.9791522026062012,
      "rewards/cargo_build_reward": 0.375,
      "rewards/cargo_clippy_reward": 0.375,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.8125,
      "rewards/tests_have_asserts_reward": 0.3828125,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 622.0,
      "completions/max_terminated_length": 622.0,
      "completions/mean_length": 339.5,
      "completions/mean_terminated_length": 339.5,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "epoch": 0.008961303462321792,
      "grad_norm": 0.11906376751591456,
      "kl": 0.002227783203125,
      "learning_rate": 1e-06,
      "loss": 0.0048,
      "num_tokens": 566959.0,
      "reward": 5.0703125,
      "reward_std": 1.1388654708862305,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 707.0,
      "completions/max_terminated_length": 707.0,
      "completions/mean_length": 373.84375,
      "completions/mean_terminated_length": 373.84375,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.009368635437881873,
      "grad_norm": 0.11868349885586216,
      "kl": 0.0027923583984375,
      "learning_rate": 1e-06,
      "loss": -0.0035,
      "num_tokens": 593706.0,
      "reward": 3.890625,
      "reward_std": 1.3067610263824463,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.9375,
      "rewards/test_block_count_reward": 0.84375,
      "rewards/tests_have_asserts_reward": 0.46875,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 797.0,
      "completions/max_terminated_length": 797.0,
      "completions/mean_length": 379.25,
      "completions/mean_terminated_length": 404.53333333333336,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 163.0,
      "epoch": 0.009775967413441956,
      "grad_norm": 0.14299534105777312,
      "kl": 0.00243377685546875,
      "learning_rate": 1e-06,
      "loss": -0.0318,
      "num_tokens": 628716.0,
      "reward": 3.53125,
      "reward_std": 1.060036540031433,
      "rewards/cargo_build_reward": 0.4375,
      "rewards/cargo_clippy_reward": 0.4375,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 0.9375,
      "rewards/non_empty_reward": 0.9375,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.59375,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 643.0,
      "completions/max_terminated_length": 643.0,
      "completions/mean_length": 281.03125,
      "completions/mean_terminated_length": 281.03125,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.010183299389002037,
      "grad_norm": 0.06393125755602662,
      "kl": 0.0063934326171875,
      "learning_rate": 1e-06,
      "loss": -0.002,
      "num_tokens": 652525.0,
      "reward": 3.21875,
      "reward_std": 0.6925716400146484,
      "rewards/cargo_build_reward": 0.25,
      "rewards/cargo_clippy_reward": 0.25,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.90625,
      "rewards/tests_have_asserts_reward": 0.640625,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 573.0,
      "completions/max_terminated_length": 573.0,
      "completions/mean_length": 283.5,
      "completions/mean_terminated_length": 283.5,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.010590631364562118,
      "grad_norm": 0.19607205581028808,
      "kl": 0.0119476318359375,
      "learning_rate": 1e-06,
      "loss": -0.0135,
      "num_tokens": 677125.0,
      "reward": 3.9921875,
      "reward_std": 2.011322259902954,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.84375,
      "rewards/test_block_count_reward": 0.59375,
      "rewards/tests_have_asserts_reward": 0.1640625,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 543.0,
      "completions/max_terminated_length": 543.0,
      "completions/mean_length": 276.71875,
      "completions/mean_terminated_length": 276.71875,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.010997963340122199,
      "grad_norm": 0.12244939393928782,
      "kl": 0.0082244873046875,
      "learning_rate": 1e-06,
      "loss": -0.0051,
      "num_tokens": 701276.0,
      "reward": 4.4453125,
      "reward_std": 1.0721098184585571,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.5703125,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1031.0,
      "completions/max_terminated_length": 1031.0,
      "completions/mean_length": 340.6875,
      "completions/mean_terminated_length": 340.6875,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 0.011405295315682282,
      "grad_norm": 0.11316959025254554,
      "kl": 0.0091094970703125,
      "learning_rate": 1e-06,
      "loss": -0.0098,
      "num_tokens": 727458.0,
      "reward": 4.03125,
      "reward_std": 1.1808925867080688,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.5,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.8125,
      "rewards/tests_have_asserts_reward": 0.5,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 731.0,
      "completions/max_terminated_length": 731.0,
      "completions/mean_length": 351.28125,
      "completions/mean_terminated_length": 351.28125,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.011812627291242363,
      "grad_norm": 0.11011504762441725,
      "kl": 0.01514434814453125,
      "learning_rate": 1e-06,
      "loss": -0.0062,
      "num_tokens": 753907.0,
      "reward": 4.3046875,
      "reward_std": 1.0233951807022095,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.84375,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 975.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 339.5625,
      "completions/mean_terminated_length": 339.5625,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "epoch": 0.012219959266802444,
      "grad_norm": 0.0720316910153896,
      "kl": 0.01592254638671875,
      "learning_rate": 1e-06,
      "loss": -0.0011,
      "num_tokens": 779765.0,
      "reward": 3.6484375,
      "reward_std": 0.7949134111404419,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.9375,
      "rewards/test_block_count_reward": 0.90625,
      "rewards/tests_have_asserts_reward": 0.6953125,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 654.0,
      "completions/max_terminated_length": 654.0,
      "completions/mean_length": 313.59375,
      "completions/mean_terminated_length": 313.59375,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.012627291242362525,
      "grad_norm": 0.1388954396991544,
      "kl": 0.0106353759765625,
      "learning_rate": 1e-06,
      "loss": -0.0003,
      "num_tokens": 804544.0,
      "reward": 4.3671875,
      "reward_std": 1.3792965412139893,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.5234375,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2822.0,
      "completions/max_terminated_length": 672.0,
      "completions/mean_length": 419.40625,
      "completions/mean_terminated_length": 341.9032258064516,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.013034623217922607,
      "grad_norm": 0.1324050390267588,
      "kl": 0.00836944580078125,
      "learning_rate": 1e-06,
      "loss": 0.0489,
      "num_tokens": 832677.0,
      "reward": 3.71875,
      "reward_std": 1.5190155506134033,
      "rewards/cargo_build_reward": 0.40625,
      "rewards/cargo_clippy_reward": 0.40625,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.671875,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 616.0,
      "completions/max_terminated_length": 616.0,
      "completions/mean_length": 307.90625,
      "completions/mean_terminated_length": 307.90625,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.013441955193482688,
      "grad_norm": 0.08163626831475472,
      "kl": 0.0209808349609375,
      "learning_rate": 1e-06,
      "loss": -0.0067,
      "num_tokens": 857042.0,
      "reward": 4.515625,
      "reward_std": 1.0991976261138916,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.5,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.765625,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 896.0,
      "completions/max_terminated_length": 896.0,
      "completions/mean_length": 392.40625,
      "completions/mean_terminated_length": 392.40625,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.01384928716904277,
      "grad_norm": 0.12031590107086831,
      "kl": 0.01251220703125,
      "learning_rate": 1e-06,
      "loss": 0.0063,
      "num_tokens": 884423.0,
      "reward": 3.59375,
      "reward_std": 1.2008159160614014,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.5,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.90625,
      "rewards/tests_have_asserts_reward": 0.390625,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 732.0,
      "completions/max_terminated_length": 732.0,
      "completions/mean_length": 306.3125,
      "completions/mean_terminated_length": 306.3125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "epoch": 0.014256619144602852,
      "grad_norm": 0.1399342572201649,
      "kl": 0.0334320068359375,
      "learning_rate": 1e-06,
      "loss": -0.0,
      "num_tokens": 909393.0,
      "reward": 4.5625,
      "reward_std": 1.0657768249511719,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.90625,
      "rewards/tests_have_asserts_reward": 0.453125,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 762.0,
      "completions/max_terminated_length": 762.0,
      "completions/mean_length": 463.9375,
      "completions/mean_terminated_length": 463.9375,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.014663951120162933,
      "grad_norm": 0.1078256076237383,
      "kl": 0.0101165771484375,
      "learning_rate": 1e-06,
      "loss": 0.0049,
      "num_tokens": 939071.0,
      "reward": 3.4140625,
      "reward_std": 0.945007860660553,
      "rewards/cargo_build_reward": 0.34375,
      "rewards/cargo_clippy_reward": 0.34375,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.7578125,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 796.0,
      "completions/max_terminated_length": 796.0,
      "completions/mean_length": 342.4375,
      "completions/mean_terminated_length": 342.4375,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "epoch": 0.015071283095723014,
      "grad_norm": 0.11291825337657914,
      "kl": 0.0157623291015625,
      "learning_rate": 1e-06,
      "loss": 0.0037,
      "num_tokens": 964677.0,
      "reward": 4.2578125,
      "reward_std": 1.248620629310608,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1294.0,
      "completions/max_terminated_length": 1294.0,
      "completions/mean_length": 531.9375,
      "completions/mean_terminated_length": 531.9375,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.015478615071283095,
      "grad_norm": 0.09047481558569173,
      "kl": 0.0214385986328125,
      "learning_rate": 1e-06,
      "loss": -0.0087,
      "num_tokens": 997251.0,
      "reward": 3.078125,
      "reward_std": 0.8194274306297302,
      "rewards/cargo_build_reward": 0.25,
      "rewards/cargo_clippy_reward": 0.25,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.609375,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 590.0,
      "completions/max_terminated_length": 590.0,
      "completions/mean_length": 335.34375,
      "completions/mean_terminated_length": 346.16129032258067,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 143.0,
      "epoch": 0.015885947046843176,
      "grad_norm": 0.12381201903810035,
      "kl": 0.0168304443359375,
      "learning_rate": 1e-06,
      "loss": -0.0139,
      "num_tokens": 1026833.0,
      "reward": 3.8515625,
      "reward_std": 1.401503086090088,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.5,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 638.0,
      "completions/max_terminated_length": 638.0,
      "completions/mean_length": 352.8125,
      "completions/mean_terminated_length": 352.8125,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "epoch": 0.016293279022403257,
      "grad_norm": 0.13170498818621618,
      "kl": 0.0164947509765625,
      "learning_rate": 1e-06,
      "loss": 0.003,
      "num_tokens": 1053755.0,
      "reward": 4.6640625,
      "reward_std": 1.2833765745162964,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6640625,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 557.0,
      "completions/max_terminated_length": 557.0,
      "completions/mean_length": 323.5625,
      "completions/mean_terminated_length": 334.0,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 111.0,
      "epoch": 0.01670061099796334,
      "grad_norm": 0.10743206140354492,
      "kl": 0.021087646484375,
      "learning_rate": 1e-06,
      "loss": -0.0139,
      "num_tokens": 1083248.0,
      "reward": 3.3828125,
      "reward_std": 1.0023860931396484,
      "rewards/cargo_build_reward": 0.4375,
      "rewards/cargo_clippy_reward": 0.4375,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.5859375,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 644.0,
      "completions/max_terminated_length": 644.0,
      "completions/mean_length": 413.8125,
      "completions/mean_terminated_length": 413.8125,
      "completions/min_length": 252.0,
      "completions/min_terminated_length": 252.0,
      "epoch": 0.017107942973523423,
      "grad_norm": 0.10643379964820185,
      "kl": 0.015838623046875,
      "learning_rate": 1e-06,
      "loss": 0.0031,
      "num_tokens": 1111402.0,
      "reward": 4.0625,
      "reward_std": 1.0337579250335693,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.5,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 663.0,
      "completions/max_terminated_length": 663.0,
      "completions/mean_length": 358.9375,
      "completions/mean_terminated_length": 358.9375,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.017515274949083504,
      "grad_norm": 0.11035849866551167,
      "kl": 0.18438720703125,
      "learning_rate": 1e-06,
      "loss": -0.0102,
      "num_tokens": 1137288.0,
      "reward": 3.390625,
      "reward_std": 0.7923446893692017,
      "rewards/cargo_build_reward": 0.40625,
      "rewards/cargo_clippy_reward": 0.40625,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.90625,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.640625,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 859.0,
      "completions/max_terminated_length": 859.0,
      "completions/mean_length": 349.5,
      "completions/mean_terminated_length": 349.5,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "epoch": 0.017922606924643585,
      "grad_norm": 0.13138570856112985,
      "kl": 0.02716064453125,
      "learning_rate": 1e-06,
      "loss": 0.0171,
      "num_tokens": 1163224.0,
      "reward": 5.046875,
      "reward_std": 1.3160395622253418,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.5625,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1063.0,
      "completions/max_terminated_length": 1063.0,
      "completions/mean_length": 451.375,
      "completions/mean_terminated_length": 451.375,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.018329938900203666,
      "grad_norm": 0.12717083019014958,
      "kl": 0.014984130859375,
      "learning_rate": 1e-06,
      "loss": 0.0054,
      "num_tokens": 1192716.0,
      "reward": 3.5625,
      "reward_std": 1.2201060056686401,
      "rewards/cargo_build_reward": 0.40625,
      "rewards/cargo_clippy_reward": 0.40625,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.625,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 607.0,
      "completions/max_terminated_length": 607.0,
      "completions/mean_length": 375.59375,
      "completions/mean_terminated_length": 375.59375,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.018737270875763747,
      "grad_norm": 0.11688640840387661,
      "kl": 0.085601806640625,
      "learning_rate": 1e-06,
      "loss": 0.002,
      "num_tokens": 1219279.0,
      "reward": 3.8203125,
      "reward_std": 1.2091054916381836,
      "rewards/cargo_build_reward": 0.4375,
      "rewards/cargo_clippy_reward": 0.4375,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 579.0,
      "completions/max_terminated_length": 579.0,
      "completions/mean_length": 363.3125,
      "completions/mean_terminated_length": 363.3125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "epoch": 0.019144602851323828,
      "grad_norm": 0.1207956799753875,
      "kl": 0.024078369140625,
      "learning_rate": 1e-06,
      "loss": 0.004,
      "num_tokens": 1246617.0,
      "reward": 3.953125,
      "reward_std": 1.2309598922729492,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.703125,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 539.0,
      "completions/max_terminated_length": 539.0,
      "completions/mean_length": 325.125,
      "completions/mean_terminated_length": 325.125,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "epoch": 0.019551934826883912,
      "grad_norm": 0.10192110556876445,
      "kl": 0.027984619140625,
      "learning_rate": 1e-06,
      "loss": -0.0066,
      "num_tokens": 1272621.0,
      "reward": 4.96875,
      "reward_std": 1.1239038705825806,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.65625,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1015.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 368.28125,
      "completions/mean_terminated_length": 368.28125,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.019959266802443993,
      "grad_norm": 0.09942170746232587,
      "kl": 0.06756591796875,
      "learning_rate": 1e-06,
      "loss": 0.0004,
      "num_tokens": 1299686.0,
      "reward": 4.1171875,
      "reward_std": 1.0894885063171387,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.6640625,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 530.0,
      "completions/max_terminated_length": 530.0,
      "completions/mean_length": 292.25,
      "completions/mean_terminated_length": 292.25,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.020366598778004074,
      "grad_norm": 0.07022626825674544,
      "kl": 0.034515380859375,
      "learning_rate": 1e-06,
      "loss": -0.0008,
      "num_tokens": 1323846.0,
      "reward": 4.515625,
      "reward_std": 0.8746212124824524,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.765625,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1590.0,
      "completions/max_terminated_length": 1590.0,
      "completions/mean_length": 393.28125,
      "completions/mean_terminated_length": 405.96774193548384,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.020773930753564155,
      "grad_norm": 0.17307638223220873,
      "kl": 0.02642822265625,
      "learning_rate": 1e-06,
      "loss": -0.0037,
      "num_tokens": 1355344.0,
      "reward": 4.53125,
      "reward_std": 0.9753614068031311,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 0.9375,
      "rewards/non_empty_reward": 0.90625,
      "rewards/test_block_count_reward": 0.875,
      "rewards/tests_have_asserts_reward": 0.53125,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 577.0,
      "completions/max_terminated_length": 577.0,
      "completions/mean_length": 276.875,
      "completions/mean_terminated_length": 285.80645161290323,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 117.0,
      "epoch": 0.021181262729124236,
      "grad_norm": 0.1257849939448993,
      "kl": 0.03900146484375,
      "learning_rate": 1e-06,
      "loss": -0.0143,
      "num_tokens": 1382811.0,
      "reward": 3.8359375,
      "reward_std": 1.3329918384552002,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.6640625,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1148.0,
      "completions/max_terminated_length": 1148.0,
      "completions/mean_length": 391.5625,
      "completions/mean_terminated_length": 391.5625,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.021588594704684317,
      "grad_norm": 0.11213828920245573,
      "kl": 0.1412353515625,
      "learning_rate": 1e-06,
      "loss": -0.0029,
      "num_tokens": 1409941.0,
      "reward": 4.2578125,
      "reward_std": 0.9867215156555176,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 799.0,
      "completions/max_terminated_length": 799.0,
      "completions/mean_length": 325.96875,
      "completions/mean_terminated_length": 325.96875,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "epoch": 0.021995926680244398,
      "grad_norm": 0.07234987825205394,
      "kl": 0.03948974609375,
      "learning_rate": 1e-06,
      "loss": 0.0023,
      "num_tokens": 1435180.0,
      "reward": 4.9609375,
      "reward_std": 0.8981945514678955,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 613.0,
      "completions/max_terminated_length": 613.0,
      "completions/mean_length": 279.0625,
      "completions/mean_terminated_length": 279.0625,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.02240325865580448,
      "grad_norm": 0.14348224459256667,
      "kl": 0.193603515625,
      "learning_rate": 1e-06,
      "loss": -0.0013,
      "num_tokens": 1459278.0,
      "reward": 4.4140625,
      "reward_std": 1.7242329120635986,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.5546875,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 590.0,
      "completions/max_terminated_length": 590.0,
      "completions/mean_length": 312.8125,
      "completions/mean_terminated_length": 312.8125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.022810590631364563,
      "grad_norm": 0.1167305580985666,
      "kl": 0.04241943359375,
      "learning_rate": 1e-06,
      "loss": 0.0031,
      "num_tokens": 1484160.0,
      "reward": 4.984375,
      "reward_std": 1.2556254863739014,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.625,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 654.0,
      "completions/max_terminated_length": 654.0,
      "completions/mean_length": 363.09375,
      "completions/mean_terminated_length": 363.09375,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.023217922606924644,
      "grad_norm": 0.09973532180599194,
      "kl": 0.038330078125,
      "learning_rate": 1e-06,
      "loss": -0.0025,
      "num_tokens": 1511323.0,
      "reward": 3.6171875,
      "reward_std": 1.0292447805404663,
      "rewards/cargo_build_reward": 0.4375,
      "rewards/cargo_clippy_reward": 0.4375,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.4921875,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 734.0,
      "completions/max_terminated_length": 734.0,
      "completions/mean_length": 393.6875,
      "completions/mean_terminated_length": 393.6875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.023625254582484725,
      "grad_norm": 0.1346776199533448,
      "kl": 0.03607177734375,
      "learning_rate": 1e-06,
      "loss": -0.0115,
      "num_tokens": 1539721.0,
      "reward": 4.3359375,
      "reward_std": 1.3484569787979126,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 655.0,
      "completions/max_terminated_length": 655.0,
      "completions/mean_length": 331.34375,
      "completions/mean_terminated_length": 331.34375,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.024032586558044806,
      "grad_norm": 0.13504668058649946,
      "kl": 0.0440673828125,
      "learning_rate": 1e-06,
      "loss": -0.0016,
      "num_tokens": 1565284.0,
      "reward": 4.7578125,
      "reward_std": 1.4678330421447754,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 787.0,
      "completions/max_terminated_length": 787.0,
      "completions/mean_length": 401.125,
      "completions/mean_terminated_length": 401.125,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.024439918533604887,
      "grad_norm": 0.1234455225844031,
      "kl": 0.037353515625,
      "learning_rate": 1e-06,
      "loss": -0.0023,
      "num_tokens": 1593368.0,
      "reward": 4.0078125,
      "reward_std": 1.077244520187378,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7578125,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 672.0,
      "completions/max_terminated_length": 672.0,
      "completions/mean_length": 328.4375,
      "completions/mean_terminated_length": 328.4375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "epoch": 0.02484725050916497,
      "grad_norm": 0.08762858795609393,
      "kl": 0.04791259765625,
      "learning_rate": 1e-06,
      "loss": -0.0024,
      "num_tokens": 1619062.0,
      "reward": 4.90625,
      "reward_std": 0.9854581952095032,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.78125,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 718.0,
      "completions/max_terminated_length": 718.0,
      "completions/mean_length": 309.78125,
      "completions/mean_terminated_length": 309.78125,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "epoch": 0.02525458248472505,
      "grad_norm": 0.09003120864656787,
      "kl": 0.052978515625,
      "learning_rate": 1e-06,
      "loss": 0.0065,
      "num_tokens": 1643903.0,
      "reward": 5.5546875,
      "reward_std": 1.0079970359802246,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.5625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 942.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 370.09375,
      "completions/mean_terminated_length": 370.09375,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "epoch": 0.025661914460285134,
      "grad_norm": 0.10777385978776224,
      "kl": 0.0457763671875,
      "learning_rate": 1e-06,
      "loss": 0.0071,
      "num_tokens": 1671162.0,
      "reward": 4.6796875,
      "reward_std": 1.262016773223877,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 503.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 269.25,
      "completions/mean_terminated_length": 269.25,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 0.026069246435845215,
      "grad_norm": 0.11339301063035911,
      "kl": 0.05950927734375,
      "learning_rate": 1e-06,
      "loss": -0.0017,
      "num_tokens": 1694594.0,
      "reward": 4.90625,
      "reward_std": 1.3360557556152344,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.609375,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1028.0,
      "completions/max_terminated_length": 1028.0,
      "completions/mean_length": 496.65625,
      "completions/mean_terminated_length": 496.65625,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.026476578411405296,
      "grad_norm": 0.12964564093106987,
      "kl": 0.0882568359375,
      "learning_rate": 1e-06,
      "loss": -0.003,
      "num_tokens": 1725039.0,
      "reward": 3.1875,
      "reward_std": 0.9519739151000977,
      "rewards/cargo_build_reward": 0.1875,
      "rewards/cargo_clippy_reward": 0.1875,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6875,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 3610.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 467.78125,
      "completions/mean_terminated_length": 366.4193548387097,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "epoch": 0.026883910386965377,
      "grad_norm": 0.10976050477139662,
      "kl": 0.037506103515625,
      "learning_rate": 1e-06,
      "loss": 0.0787,
      "num_tokens": 1755272.0,
      "reward": 4.0703125,
      "reward_std": 1.0649116039276123,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 738.0,
      "completions/max_terminated_length": 738.0,
      "completions/mean_length": 356.53125,
      "completions/mean_terminated_length": 356.53125,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.027291242362525458,
      "grad_norm": 0.14099648252527722,
      "kl": 0.04632568359375,
      "learning_rate": 1e-06,
      "loss": 0.0048,
      "num_tokens": 1782521.0,
      "reward": 4.2578125,
      "reward_std": 1.2450398206710815,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 599.0,
      "completions/max_terminated_length": 599.0,
      "completions/mean_length": 289.75,
      "completions/mean_terminated_length": 299.0967741935484,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 123.0,
      "epoch": 0.02769857433808554,
      "grad_norm": 0.13464657563187157,
      "kl": 0.0574951171875,
      "learning_rate": 1e-06,
      "loss": -0.0053,
      "num_tokens": 1810108.0,
      "reward": 4.9140625,
      "reward_std": 1.5103744268417358,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 951.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 453.84375,
      "completions/mean_terminated_length": 453.84375,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "epoch": 0.02810590631364562,
      "grad_norm": 0.14876203658898152,
      "kl": 0.0380859375,
      "learning_rate": 1e-06,
      "loss": 0.0116,
      "num_tokens": 1839087.0,
      "reward": 3.8828125,
      "reward_std": 1.2079672813415527,
      "rewards/cargo_build_reward": 0.4375,
      "rewards/cargo_clippy_reward": 0.4375,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8671875,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 636.0,
      "completions/max_terminated_length": 636.0,
      "completions/mean_length": 293.0625,
      "completions/mean_terminated_length": 302.51612903225805,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 143.0,
      "epoch": 0.028513238289205704,
      "grad_norm": 0.09396813349655643,
      "kl": 0.3768310546875,
      "learning_rate": 1e-06,
      "loss": -0.0073,
      "num_tokens": 1866810.0,
      "reward": 3.96875,
      "reward_std": 0.795851469039917,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.734375,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 779.0,
      "completions/max_terminated_length": 779.0,
      "completions/mean_length": 406.5,
      "completions/mean_terminated_length": 419.61290322580646,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 0.028920570264765785,
      "grad_norm": 0.12598281125371927,
      "kl": 0.03662109375,
      "learning_rate": 1e-06,
      "loss": -0.0031,
      "num_tokens": 1898261.0,
      "reward": 3.703125,
      "reward_std": 1.0857936143875122,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.578125,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 563.0,
      "completions/max_terminated_length": 563.0,
      "completions/mean_length": 278.8125,
      "completions/mean_terminated_length": 278.8125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.029327902240325866,
      "grad_norm": 0.10930021044923645,
      "kl": 0.06024169921875,
      "learning_rate": 1e-06,
      "loss": 0.0039,
      "num_tokens": 1921943.0,
      "reward": 4.8984375,
      "reward_std": 1.2769497632980347,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1112.0,
      "completions/max_terminated_length": 1112.0,
      "completions/mean_length": 435.65625,
      "completions/mean_terminated_length": 435.65625,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "epoch": 0.029735234215885947,
      "grad_norm": 0.07778383657694855,
      "kl": 0.0411376953125,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 1951292.0,
      "reward": 4.1953125,
      "reward_std": 0.9303152561187744,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.5859375,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 682.0,
      "completions/max_terminated_length": 682.0,
      "completions/mean_length": 345.0625,
      "completions/mean_terminated_length": 345.0625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "epoch": 0.03014256619144603,
      "grad_norm": 0.11792150853440085,
      "kl": 0.08026123046875,
      "learning_rate": 1e-06,
      "loss": -0.0052,
      "num_tokens": 1976758.0,
      "reward": 4.4453125,
      "reward_std": 0.9902048110961914,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 785.0,
      "completions/max_terminated_length": 785.0,
      "completions/mean_length": 372.6875,
      "completions/mean_terminated_length": 372.6875,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.03054989816700611,
      "grad_norm": 0.12309215125848344,
      "kl": 0.05194091796875,
      "learning_rate": 1e-06,
      "loss": -0.0011,
      "num_tokens": 2004436.0,
      "reward": 4.4609375,
      "reward_std": 1.3562747240066528,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8671875,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1073.0,
      "completions/max_terminated_length": 1073.0,
      "completions/mean_length": 385.59375,
      "completions/mean_terminated_length": 385.59375,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "epoch": 0.03095723014256619,
      "grad_norm": 0.1384324357143004,
      "kl": 0.04718017578125,
      "learning_rate": 1e-06,
      "loss": 0.0235,
      "num_tokens": 2031031.0,
      "reward": 4.34375,
      "reward_std": 0.9096367955207825,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.5625,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 709.0,
      "completions/max_terminated_length": 709.0,
      "completions/mean_length": 277.6875,
      "completions/mean_terminated_length": 277.6875,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "epoch": 0.03136456211812627,
      "grad_norm": 0.07950834144105107,
      "kl": 0.06903076171875,
      "learning_rate": 1e-06,
      "loss": 0.0053,
      "num_tokens": 2054549.0,
      "reward": 5.2890625,
      "reward_std": 0.8124831914901733,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7890625,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 821.0,
      "completions/max_terminated_length": 821.0,
      "completions/mean_length": 406.21875,
      "completions/mean_terminated_length": 406.21875,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.03177189409368635,
      "grad_norm": 0.133902651952502,
      "kl": 0.0469970703125,
      "learning_rate": 1e-06,
      "loss": 0.0077,
      "num_tokens": 2081900.0,
      "reward": 3.5234375,
      "reward_std": 1.0582976341247559,
      "rewards/cargo_build_reward": 0.28125,
      "rewards/cargo_clippy_reward": 0.28125,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 819.0,
      "completions/max_terminated_length": 819.0,
      "completions/mean_length": 453.71875,
      "completions/mean_terminated_length": 453.71875,
      "completions/min_length": 241.0,
      "completions/min_terminated_length": 241.0,
      "epoch": 0.03217922606924643,
      "grad_norm": 0.08836244866516904,
      "kl": 0.04144287109375,
      "learning_rate": 1e-06,
      "loss": 0.0025,
      "num_tokens": 2111187.0,
      "reward": 3.5625,
      "reward_std": 0.8975033760070801,
      "rewards/cargo_build_reward": 0.375,
      "rewards/cargo_clippy_reward": 0.375,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.609375,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 723.0,
      "completions/max_terminated_length": 723.0,
      "completions/mean_length": 400.09375,
      "completions/mean_terminated_length": 400.09375,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "epoch": 0.032586558044806514,
      "grad_norm": 0.15207937990490714,
      "kl": 0.18634033203125,
      "learning_rate": 1e-06,
      "loss": -0.0043,
      "num_tokens": 2138638.0,
      "reward": 3.9765625,
      "reward_std": 1.2686288356781006,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7265625,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 533.0,
      "completions/max_terminated_length": 533.0,
      "completions/mean_length": 300.40625,
      "completions/mean_terminated_length": 300.40625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "epoch": 0.0329938900203666,
      "grad_norm": 0.10830167291849532,
      "kl": 0.064697265625,
      "learning_rate": 1e-06,
      "loss": -0.0046,
      "num_tokens": 2162715.0,
      "reward": 5.515625,
      "reward_std": 0.6978596448898315,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.890625,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 431.0,
      "completions/max_terminated_length": 431.0,
      "completions/mean_length": 291.90625,
      "completions/mean_terminated_length": 291.90625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "epoch": 0.03340122199592668,
      "grad_norm": 0.11140370069913581,
      "kl": 0.40771484375,
      "learning_rate": 1e-06,
      "loss": 0.0024,
      "num_tokens": 2187448.0,
      "reward": 4.1875,
      "reward_std": 1.369483232498169,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.703125,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 680.0,
      "completions/max_terminated_length": 680.0,
      "completions/mean_length": 337.53125,
      "completions/mean_terminated_length": 337.53125,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "epoch": 0.033808553971486764,
      "grad_norm": 0.13283768108323413,
      "kl": 0.0582275390625,
      "learning_rate": 1e-06,
      "loss": 0.0048,
      "num_tokens": 2213129.0,
      "reward": 4.265625,
      "reward_std": 1.1658533811569214,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.515625,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 797.0,
      "completions/max_terminated_length": 797.0,
      "completions/mean_length": 435.125,
      "completions/mean_terminated_length": 449.16129032258067,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 0.034215885947046845,
      "grad_norm": 0.12189998293491974,
      "kl": 0.12158203125,
      "learning_rate": 1e-06,
      "loss": -0.0031,
      "num_tokens": 2245827.0,
      "reward": 3.5625,
      "reward_std": 1.343709111213684,
      "rewards/cargo_build_reward": 0.375,
      "rewards/cargo_clippy_reward": 0.34375,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1060.0,
      "completions/max_terminated_length": 1060.0,
      "completions/mean_length": 320.21875,
      "completions/mean_terminated_length": 320.21875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.034623217922606926,
      "grad_norm": 0.10492427493641471,
      "kl": 0.0643310546875,
      "learning_rate": 1e-06,
      "loss": 0.0012,
      "num_tokens": 2270802.0,
      "reward": 5.015625,
      "reward_std": 1.1473376750946045,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.765625,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 881.0,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 331.34375,
      "completions/mean_terminated_length": 331.34375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "epoch": 0.03503054989816701,
      "grad_norm": 0.11451060016867645,
      "kl": 0.06793212890625,
      "learning_rate": 1e-06,
      "loss": -0.0082,
      "num_tokens": 2295861.0,
      "reward": 4.5390625,
      "reward_std": 1.3130030632019043,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 800.0,
      "completions/max_terminated_length": 800.0,
      "completions/mean_length": 389.65625,
      "completions/mean_terminated_length": 389.65625,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "epoch": 0.03543788187372709,
      "grad_norm": 0.06681902057762526,
      "kl": 0.05255126953125,
      "learning_rate": 1e-06,
      "loss": 0.0026,
      "num_tokens": 2324050.0,
      "reward": 4.015625,
      "reward_std": 0.4756559729576111,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.515625,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 638.0,
      "completions/max_terminated_length": 638.0,
      "completions/mean_length": 330.5,
      "completions/mean_terminated_length": 330.5,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "epoch": 0.03584521384928717,
      "grad_norm": 0.12738046714303833,
      "kl": 0.0687255859375,
      "learning_rate": 1e-06,
      "loss": -0.0012,
      "num_tokens": 2349850.0,
      "reward": 4.4375,
      "reward_std": 1.194375991821289,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.625,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1023.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 370.9375,
      "completions/mean_terminated_length": 370.9375,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.03625254582484725,
      "grad_norm": 0.07802640302364765,
      "kl": 0.0546875,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 2376280.0,
      "reward": 5.1015625,
      "reward_std": 0.8754923343658447,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6640625,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 630.0,
      "completions/max_terminated_length": 630.0,
      "completions/mean_length": 389.5,
      "completions/mean_terminated_length": 389.5,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "epoch": 0.03665987780040733,
      "grad_norm": 0.09057003503703503,
      "kl": 0.05352783203125,
      "learning_rate": 1e-06,
      "loss": -0.0053,
      "num_tokens": 2403880.0,
      "reward": 4.4296875,
      "reward_std": 0.9106569886207581,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 544.0,
      "completions/max_terminated_length": 544.0,
      "completions/mean_length": 291.4375,
      "completions/mean_terminated_length": 291.4375,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "epoch": 0.03706720977596741,
      "grad_norm": 0.13704527818486306,
      "kl": 0.0699462890625,
      "learning_rate": 1e-06,
      "loss": -0.0005,
      "num_tokens": 2428718.0,
      "reward": 4.7890625,
      "reward_std": 1.4855718612670898,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.6171875,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 256.5625,
      "completions/mean_terminated_length": 256.5625,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "epoch": 0.03747454175152749,
      "grad_norm": 0.08634017282421738,
      "kl": 0.109619140625,
      "learning_rate": 1e-06,
      "loss": -0.002,
      "num_tokens": 2451736.0,
      "reward": 5.5234375,
      "reward_std": 1.2359544038772583,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8984375,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 573.0,
      "completions/max_terminated_length": 573.0,
      "completions/mean_length": 329.53125,
      "completions/mean_terminated_length": 329.53125,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.037881873727087574,
      "grad_norm": 0.10753217283790532,
      "kl": 0.06671142578125,
      "learning_rate": 1e-06,
      "loss": -0.0011,
      "num_tokens": 2476761.0,
      "reward": 4.34375,
      "reward_std": 1.0839296579360962,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.78125,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 805.0,
      "completions/max_terminated_length": 805.0,
      "completions/mean_length": 372.09375,
      "completions/mean_terminated_length": 372.09375,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "epoch": 0.038289205702647655,
      "grad_norm": 0.10051808748142209,
      "kl": 0.059814453125,
      "learning_rate": 1e-06,
      "loss": -0.0007,
      "num_tokens": 2503772.0,
      "reward": 4.4765625,
      "reward_std": 0.886406421661377,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7890625,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1232.0,
      "completions/max_terminated_length": 1232.0,
      "completions/mean_length": 536.03125,
      "completions/mean_terminated_length": 536.03125,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "epoch": 0.038696537678207736,
      "grad_norm": 0.14038449079335694,
      "kl": 0.0394287109375,
      "learning_rate": 1e-06,
      "loss": 0.0056,
      "num_tokens": 2535957.0,
      "reward": 3.6328125,
      "reward_std": 1.128450632095337,
      "rewards/cargo_build_reward": 0.40625,
      "rewards/cargo_clippy_reward": 0.40625,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.6484375,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1085.0,
      "completions/max_terminated_length": 1085.0,
      "completions/mean_length": 388.4375,
      "completions/mean_terminated_length": 388.4375,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.039103869653767824,
      "grad_norm": 0.09311033485193937,
      "kl": 0.0550537109375,
      "learning_rate": 1e-06,
      "loss": 0.0038,
      "num_tokens": 2562459.0,
      "reward": 3.703125,
      "reward_std": 0.8902084827423096,
      "rewards/cargo_build_reward": 0.40625,
      "rewards/cargo_clippy_reward": 0.40625,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.890625,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1293.0,
      "completions/max_terminated_length": 1293.0,
      "completions/mean_length": 422.40625,
      "completions/mean_terminated_length": 422.40625,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "epoch": 0.039511201629327905,
      "grad_norm": 0.09004957679418751,
      "kl": 0.0880126953125,
      "learning_rate": 1e-06,
      "loss": 0.0079,
      "num_tokens": 2591568.0,
      "reward": 3.609375,
      "reward_std": 0.8331196904182434,
      "rewards/cargo_build_reward": 0.4375,
      "rewards/cargo_clippy_reward": 0.4375,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.4375,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 875.0,
      "completions/max_terminated_length": 875.0,
      "completions/mean_length": 362.4375,
      "completions/mean_terminated_length": 362.4375,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "epoch": 0.039918533604887986,
      "grad_norm": 0.06577931737389649,
      "kl": 0.59649658203125,
      "learning_rate": 1e-06,
      "loss": -0.0015,
      "num_tokens": 2618342.0,
      "reward": 4.0,
      "reward_std": 0.5188412666320801,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.578125,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 559.0,
      "completions/max_terminated_length": 559.0,
      "completions/mean_length": 356.78125,
      "completions/mean_terminated_length": 356.78125,
      "completions/min_length": 217.0,
      "completions/min_terminated_length": 217.0,
      "epoch": 0.04032586558044807,
      "grad_norm": 0.054377706261197915,
      "kl": 0.058837890625,
      "learning_rate": 1e-06,
      "loss": 0.0006,
      "num_tokens": 2645143.0,
      "reward": 3.5546875,
      "reward_std": 0.5849505662918091,
      "rewards/cargo_build_reward": 0.375,
      "rewards/cargo_clippy_reward": 0.375,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.4921875,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 605.0,
      "completions/max_terminated_length": 605.0,
      "completions/mean_length": 354.625,
      "completions/mean_terminated_length": 354.625,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "epoch": 0.04073319755600815,
      "grad_norm": 0.139261212805125,
      "kl": 0.05682373046875,
      "learning_rate": 1e-06,
      "loss": 0.0034,
      "num_tokens": 2671251.0,
      "reward": 3.8125,
      "reward_std": 1.3117740154266357,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.5,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.5,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 780.0,
      "completions/max_terminated_length": 780.0,
      "completions/mean_length": 409.3125,
      "completions/mean_terminated_length": 409.3125,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "epoch": 0.04114052953156823,
      "grad_norm": 0.09072193629484701,
      "kl": 0.0150299072265625,
      "learning_rate": 1e-06,
      "loss": 0.0014,
      "num_tokens": 2699653.0,
      "reward": 3.9375,
      "reward_std": 0.8187613487243652,
      "rewards/cargo_build_reward": 0.40625,
      "rewards/cargo_clippy_reward": 0.40625,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.703125,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 692.0,
      "completions/max_terminated_length": 692.0,
      "completions/mean_length": 380.53125,
      "completions/mean_terminated_length": 380.53125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "epoch": 0.04154786150712831,
      "grad_norm": 0.08558946344667806,
      "kl": 0.016571044921875,
      "learning_rate": 1e-06,
      "loss": 0.0027,
      "num_tokens": 2727030.0,
      "reward": 4.828125,
      "reward_std": 0.8606547117233276,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.890625,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 649.0,
      "completions/max_terminated_length": 649.0,
      "completions/mean_length": 304.53125,
      "completions/mean_terminated_length": 304.53125,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "epoch": 0.04195519348268839,
      "grad_norm": 0.08305197700331375,
      "kl": 0.020263671875,
      "learning_rate": 1e-06,
      "loss": 0.0061,
      "num_tokens": 2752087.0,
      "reward": 5.3984375,
      "reward_std": 0.8094767928123474,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 617.0,
      "completions/max_terminated_length": 617.0,
      "completions/mean_length": 413.5,
      "completions/mean_terminated_length": 413.5,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "epoch": 0.04236252545824847,
      "grad_norm": 0.11857185466291319,
      "kl": 0.0153045654296875,
      "learning_rate": 1e-06,
      "loss": -0.0022,
      "num_tokens": 2780375.0,
      "reward": 4.1953125,
      "reward_std": 1.06247878074646,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1137.0,
      "completions/max_terminated_length": 1137.0,
      "completions/mean_length": 517.5,
      "completions/mean_terminated_length": 517.5,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 0.04276985743380855,
      "grad_norm": 0.13490652078592621,
      "kl": 0.011444091796875,
      "learning_rate": 1e-06,
      "loss": 0.0152,
      "num_tokens": 2811935.0,
      "reward": 3.5703125,
      "reward_std": 1.3479619026184082,
      "rewards/cargo_build_reward": 0.34375,
      "rewards/cargo_clippy_reward": 0.34375,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6328125,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 484.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 320.78125,
      "completions/mean_terminated_length": 320.78125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 0.043177189409368634,
      "grad_norm": 0.0943869231894059,
      "kl": 0.01837158203125,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 2837240.0,
      "reward": 3.703125,
      "reward_std": 0.8034542798995972,
      "rewards/cargo_build_reward": 0.40625,
      "rewards/cargo_clippy_reward": 0.375,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 734.0,
      "completions/max_terminated_length": 734.0,
      "completions/mean_length": 314.59375,
      "completions/mean_terminated_length": 314.59375,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "epoch": 0.043584521384928715,
      "grad_norm": 0.08439674479274882,
      "kl": 0.0181884765625,
      "learning_rate": 1e-06,
      "loss": 0.0063,
      "num_tokens": 2862835.0,
      "reward": 4.9765625,
      "reward_std": 0.824439287185669,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7265625,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 695.0,
      "completions/max_terminated_length": 695.0,
      "completions/mean_length": 399.0,
      "completions/mean_terminated_length": 399.0,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "epoch": 0.043991853360488796,
      "grad_norm": 0.10557785012511325,
      "kl": 0.0165557861328125,
      "learning_rate": 1e-06,
      "loss": -0.0059,
      "num_tokens": 2891315.0,
      "reward": 4.3046875,
      "reward_std": 1.1515986919403076,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9296875,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 473.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 260.4375,
      "completions/mean_terminated_length": 260.4375,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.04439918533604888,
      "grad_norm": 0.10739264731467306,
      "kl": 0.155548095703125,
      "learning_rate": 1e-06,
      "loss": -0.0048,
      "num_tokens": 2915001.0,
      "reward": 5.0625,
      "reward_std": 1.2184569835662842,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.703125,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1135.0,
      "completions/max_terminated_length": 1135.0,
      "completions/mean_length": 423.625,
      "completions/mean_terminated_length": 423.625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 0.04480651731160896,
      "grad_norm": 0.11203578083151883,
      "kl": 0.015380859375,
      "learning_rate": 1e-06,
      "loss": 0.0018,
      "num_tokens": 2943325.0,
      "reward": 3.921875,
      "reward_std": 1.066467523574829,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.5,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.6875,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1089.0,
      "completions/max_terminated_length": 1089.0,
      "completions/mean_length": 472.90625,
      "completions/mean_terminated_length": 472.90625,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.045213849287169046,
      "grad_norm": 0.11142257826861593,
      "kl": 0.0133209228515625,
      "learning_rate": 1e-06,
      "loss": 0.0022,
      "num_tokens": 2973514.0,
      "reward": 3.6171875,
      "reward_std": 0.9596100449562073,
      "rewards/cargo_build_reward": 0.34375,
      "rewards/cargo_clippy_reward": 0.34375,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9296875,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 633.0,
      "completions/max_terminated_length": 633.0,
      "completions/mean_length": 334.59375,
      "completions/mean_terminated_length": 334.59375,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "epoch": 0.04562118126272913,
      "grad_norm": 0.12774013251987382,
      "kl": 0.01898193359375,
      "learning_rate": 1e-06,
      "loss": -0.0067,
      "num_tokens": 2999165.0,
      "reward": 3.9921875,
      "reward_std": 1.0958621501922607,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2746.0,
      "completions/max_terminated_length": 2746.0,
      "completions/mean_length": 454.71875,
      "completions/mean_terminated_length": 454.71875,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "epoch": 0.04602851323828921,
      "grad_norm": 0.11556314604735626,
      "kl": 0.016754150390625,
      "learning_rate": 1e-06,
      "loss": -0.0358,
      "num_tokens": 3029132.0,
      "reward": 4.8828125,
      "reward_std": 0.7093173265457153,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 494.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 312.5,
      "completions/mean_terminated_length": 312.5,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "epoch": 0.04643584521384929,
      "grad_norm": 0.12511619983156722,
      "kl": 0.01885986328125,
      "learning_rate": 1e-06,
      "loss": 0.0068,
      "num_tokens": 3054100.0,
      "reward": 3.9375,
      "reward_std": 1.0332987308502197,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.75,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2299.0,
      "completions/max_terminated_length": 2299.0,
      "completions/mean_length": 482.34375,
      "completions/mean_terminated_length": 482.34375,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.04684317718940937,
      "grad_norm": 0.15488648519429415,
      "kl": 0.014068603515625,
      "learning_rate": 1e-06,
      "loss": 0.0049,
      "num_tokens": 3084839.0,
      "reward": 3.8828125,
      "reward_std": 1.3750832080841064,
      "rewards/cargo_build_reward": 0.40625,
      "rewards/cargo_clippy_reward": 0.40625,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7578125,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 957.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 339.0,
      "completions/mean_terminated_length": 339.0,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.04725050916496945,
      "grad_norm": 0.11958016309066218,
      "kl": 0.0204925537109375,
      "learning_rate": 1e-06,
      "loss": 0.0004,
      "num_tokens": 3110687.0,
      "reward": 4.6875,
      "reward_std": 1.2157671451568604,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8125,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 753.0,
      "completions/max_terminated_length": 753.0,
      "completions/mean_length": 477.84375,
      "completions/mean_terminated_length": 477.84375,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.04765784114052953,
      "grad_norm": 0.13733313268090042,
      "kl": 0.01507568359375,
      "learning_rate": 1e-06,
      "loss": 0.0029,
      "num_tokens": 3141714.0,
      "reward": 3.4921875,
      "reward_std": 1.2556400299072266,
      "rewards/cargo_build_reward": 0.34375,
      "rewards/cargo_clippy_reward": 0.34375,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6171875,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 942.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 414.28125,
      "completions/mean_terminated_length": 414.28125,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.04806517311608961,
      "grad_norm": 0.11384370016057642,
      "kl": 0.01507568359375,
      "learning_rate": 1e-06,
      "loss": 0.0081,
      "num_tokens": 3170355.0,
      "reward": 3.96875,
      "reward_std": 0.9767351150512695,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.78125,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 816.0,
      "completions/max_terminated_length": 816.0,
      "completions/mean_length": 499.59375,
      "completions/mean_terminated_length": 499.59375,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.048472505091649694,
      "grad_norm": 0.192732149328862,
      "kl": 0.035491943359375,
      "learning_rate": 1e-06,
      "loss": -0.0033,
      "num_tokens": 3201262.0,
      "reward": 3.515625,
      "reward_std": 1.357240915298462,
      "rewards/cargo_build_reward": 0.375,
      "rewards/cargo_clippy_reward": 0.375,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.515625,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1158.0,
      "completions/max_terminated_length": 1158.0,
      "completions/mean_length": 353.4375,
      "completions/mean_terminated_length": 353.4375,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.048879837067209775,
      "grad_norm": 0.090119599820571,
      "kl": 0.019805908203125,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 3228220.0,
      "reward": 4.90625,
      "reward_std": 1.0091047286987305,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 891.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 382.375,
      "completions/mean_terminated_length": 382.375,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.049287169042769856,
      "grad_norm": 0.12271168318363614,
      "kl": 0.023101806640625,
      "learning_rate": 1e-06,
      "loss": -0.0033,
      "num_tokens": 3255104.0,
      "reward": 4.34375,
      "reward_std": 1.167395830154419,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1436.0,
      "completions/max_terminated_length": 1436.0,
      "completions/mean_length": 459.75,
      "completions/mean_terminated_length": 474.5806451612903,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 252.0,
      "epoch": 0.04969450101832994,
      "grad_norm": 0.09261390577655294,
      "kl": 0.0178375244140625,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 3287877.0,
      "reward": 3.421875,
      "reward_std": 0.9010084867477417,
      "rewards/cargo_build_reward": 0.3125,
      "rewards/cargo_clippy_reward": 0.3125,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 735.0,
      "completions/max_terminated_length": 735.0,
      "completions/mean_length": 448.9375,
      "completions/mean_terminated_length": 448.9375,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.05010183299389002,
      "grad_norm": 0.11138776361064398,
      "kl": 0.018585205078125,
      "learning_rate": 1e-06,
      "loss": -0.0001,
      "num_tokens": 3317739.0,
      "reward": 3.8828125,
      "reward_std": 1.1341464519500732,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.5078125,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 464.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 277.53125,
      "completions/mean_terminated_length": 286.48387096774195,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.0505091649694501,
      "grad_norm": 0.17014362985083628,
      "kl": 1.129119873046875,
      "learning_rate": 1e-06,
      "loss": -0.016,
      "num_tokens": 3345155.0,
      "reward": 4.921875,
      "reward_std": 1.6797964572906494,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.9375,
      "rewards/test_block_count_reward": 0.875,
      "rewards/tests_have_asserts_reward": 0.75,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1349.0,
      "completions/max_terminated_length": 1349.0,
      "completions/mean_length": 545.5625,
      "completions/mean_terminated_length": 545.5625,
      "completions/min_length": 255.0,
      "completions/min_terminated_length": 255.0,
      "epoch": 0.05091649694501019,
      "grad_norm": 0.12622940865825613,
      "kl": 0.0236358642578125,
      "learning_rate": 1e-06,
      "loss": 0.0141,
      "num_tokens": 3378373.0,
      "reward": 4.0859375,
      "reward_std": 1.0542640686035156,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 499.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 278.90625,
      "completions/mean_terminated_length": 278.90625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "epoch": 0.05132382892057027,
      "grad_norm": 0.08852836024214183,
      "kl": 0.04669189453125,
      "learning_rate": 1e-06,
      "loss": 0.0012,
      "num_tokens": 3401898.0,
      "reward": 5.5859375,
      "reward_std": 1.0553646087646484,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 523.0,
      "completions/max_terminated_length": 523.0,
      "completions/mean_length": 257.625,
      "completions/mean_terminated_length": 257.625,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "epoch": 0.05173116089613035,
      "grad_norm": 0.09176336366373557,
      "kl": 0.058837890625,
      "learning_rate": 1e-06,
      "loss": -0.001,
      "num_tokens": 3425278.0,
      "reward": 5.8359375,
      "reward_std": 0.834545373916626,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.59375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7109375,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1231.0,
      "completions/max_terminated_length": 1231.0,
      "completions/mean_length": 451.875,
      "completions/mean_terminated_length": 451.875,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.05213849287169043,
      "grad_norm": 0.12774418089567324,
      "kl": 0.02215576171875,
      "learning_rate": 1e-06,
      "loss": -0.0009,
      "num_tokens": 3455458.0,
      "reward": 4.796875,
      "reward_std": 1.1469030380249023,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 712.0,
      "completions/max_terminated_length": 712.0,
      "completions/mean_length": 378.75,
      "completions/mean_terminated_length": 378.75,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "epoch": 0.05254582484725051,
      "grad_norm": 0.0980242741140148,
      "kl": 0.0496826171875,
      "learning_rate": 1e-06,
      "loss": -0.0004,
      "num_tokens": 3482530.0,
      "reward": 5.0390625,
      "reward_std": 0.8885169625282288,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7265625,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 772.0,
      "completions/max_terminated_length": 772.0,
      "completions/mean_length": 373.03125,
      "completions/mean_terminated_length": 373.03125,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "epoch": 0.05295315682281059,
      "grad_norm": 0.12695718263684108,
      "kl": 0.030242919921875,
      "learning_rate": 1e-06,
      "loss": 0.0026,
      "num_tokens": 3509611.0,
      "reward": 4.484375,
      "reward_std": 1.4467928409576416,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 593.0,
      "completions/max_terminated_length": 593.0,
      "completions/mean_length": 353.21875,
      "completions/mean_terminated_length": 353.21875,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "epoch": 0.05336048879837067,
      "grad_norm": 0.12938313975955237,
      "kl": 0.029388427734375,
      "learning_rate": 1e-06,
      "loss": -0.0038,
      "num_tokens": 3536002.0,
      "reward": 4.4296875,
      "reward_std": 1.170635461807251,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 638.0,
      "completions/max_terminated_length": 638.0,
      "completions/mean_length": 350.0,
      "completions/mean_terminated_length": 350.0,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.053767820773930754,
      "grad_norm": 0.12555362474992476,
      "kl": 0.203582763671875,
      "learning_rate": 1e-06,
      "loss": -0.007,
      "num_tokens": 3563258.0,
      "reward": 5.046875,
      "reward_std": 1.3916959762573242,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.375,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 698.0,
      "completions/max_terminated_length": 698.0,
      "completions/mean_length": 352.90625,
      "completions/mean_terminated_length": 352.90625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "epoch": 0.054175152749490835,
      "grad_norm": 0.12782522321914772,
      "kl": 0.0283203125,
      "learning_rate": 1e-06,
      "loss": -0.0009,
      "num_tokens": 3589367.0,
      "reward": 4.6796875,
      "reward_std": 1.2642593383789062,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8671875,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 616.0,
      "completions/max_terminated_length": 616.0,
      "completions/mean_length": 360.25,
      "completions/mean_terminated_length": 360.25,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "epoch": 0.054582484725050916,
      "grad_norm": 0.10633934376190354,
      "kl": 0.201812744140625,
      "learning_rate": 1e-06,
      "loss": -0.0015,
      "num_tokens": 3616071.0,
      "reward": 4.3046875,
      "reward_std": 0.8920255303382874,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 341.0,
      "completions/max_terminated_length": 341.0,
      "completions/mean_length": 255.8125,
      "completions/mean_terminated_length": 255.8125,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "epoch": 0.054989816700611,
      "grad_norm": 0.09032036141210152,
      "kl": 0.068634033203125,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 3639481.0,
      "reward": 5.3046875,
      "reward_std": 0.9606647491455078,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.7578125,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1321.0,
      "completions/max_terminated_length": 1321.0,
      "completions/mean_length": 560.0625,
      "completions/mean_terminated_length": 560.0625,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.05539714867617108,
      "grad_norm": 0.11312472825908094,
      "kl": 0.026641845703125,
      "learning_rate": 1e-06,
      "loss": -0.0026,
      "num_tokens": 3672323.0,
      "reward": 3.8359375,
      "reward_std": 1.0763511657714844,
      "rewards/cargo_build_reward": 0.34375,
      "rewards/cargo_clippy_reward": 0.34375,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.7890625,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 639.0,
      "completions/max_terminated_length": 639.0,
      "completions/mean_length": 328.46875,
      "completions/mean_terminated_length": 328.46875,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.05580448065173116,
      "grad_norm": 0.1488737904564473,
      "kl": 1.011474609375,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 3697154.0,
      "reward": 5.15625,
      "reward_std": 1.3229000568389893,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.78125,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 880.0,
      "completions/max_terminated_length": 880.0,
      "completions/mean_length": 465.53125,
      "completions/mean_terminated_length": 465.53125,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "epoch": 0.05621181262729124,
      "grad_norm": 0.11183649813224111,
      "kl": 0.04339599609375,
      "learning_rate": 1e-06,
      "loss": 0.0037,
      "num_tokens": 3727339.0,
      "reward": 4.5078125,
      "reward_std": 0.8816056251525879,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9453125,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1028.0,
      "completions/max_terminated_length": 1028.0,
      "completions/mean_length": 449.1875,
      "completions/mean_terminated_length": 495.6551724137931,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 198.0,
      "epoch": 0.05661914460285132,
      "grad_norm": 0.2009998859785612,
      "kl": 0.022979736328125,
      "learning_rate": 1e-06,
      "loss": -0.0419,
      "num_tokens": 3767171.0,
      "reward": 3.6171875,
      "reward_std": 1.3957762718200684,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 0.90625,
      "rewards/non_empty_reward": 0.90625,
      "rewards/test_block_count_reward": 0.90625,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 676.0,
      "completions/max_terminated_length": 676.0,
      "completions/mean_length": 368.53125,
      "completions/mean_terminated_length": 380.4193548387097,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 201.0,
      "epoch": 0.05702647657841141,
      "grad_norm": 0.1257378479708751,
      "kl": 0.02899169921875,
      "learning_rate": 1e-06,
      "loss": -0.0132,
      "num_tokens": 3797809.0,
      "reward": 5.1640625,
      "reward_std": 1.1951626539230347,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8515625,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1415.0,
      "completions/max_terminated_length": 1415.0,
      "completions/mean_length": 429.625,
      "completions/mean_terminated_length": 429.625,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.05743380855397149,
      "grad_norm": 0.1312714485926572,
      "kl": 0.04241943359375,
      "learning_rate": 1e-06,
      "loss": 0.0111,
      "num_tokens": 3826541.0,
      "reward": 4.8984375,
      "reward_std": 1.2038742303848267,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6484375,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 639.0,
      "completions/max_terminated_length": 639.0,
      "completions/mean_length": 365.15625,
      "completions/mean_terminated_length": 365.15625,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "epoch": 0.05784114052953157,
      "grad_norm": 0.1220036429767914,
      "kl": 0.042510986328125,
      "learning_rate": 1e-06,
      "loss": 0.0049,
      "num_tokens": 3852890.0,
      "reward": 5.2265625,
      "reward_std": 1.383144736289978,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7890625,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 352.0,
      "completions/max_terminated_length": 352.0,
      "completions/mean_length": 244.9375,
      "completions/mean_terminated_length": 244.9375,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "epoch": 0.05824847250509165,
      "grad_norm": 0.13772309161184693,
      "kl": 0.0450439453125,
      "learning_rate": 1e-06,
      "loss": 0.0075,
      "num_tokens": 3875944.0,
      "reward": 5.8125,
      "reward_std": 1.4358084201812744,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.6875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.703125,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 556.0,
      "completions/max_terminated_length": 556.0,
      "completions/mean_length": 348.71875,
      "completions/mean_terminated_length": 359.96774193548384,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.05865580448065173,
      "grad_norm": 0.13794321630421855,
      "kl": 0.03729248046875,
      "learning_rate": 1e-06,
      "loss": -0.0147,
      "num_tokens": 3906347.0,
      "reward": 4.65625,
      "reward_std": 1.218233346939087,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8125,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 510.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 292.625,
      "completions/mean_terminated_length": 292.625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 0.059063136456211814,
      "grad_norm": 0.13730681937853773,
      "kl": 0.04180908203125,
      "learning_rate": 1e-06,
      "loss": -0.0029,
      "num_tokens": 3931079.0,
      "reward": 4.921875,
      "reward_std": 1.142471432685852,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.671875,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 517.0,
      "completions/max_terminated_length": 517.0,
      "completions/mean_length": 290.4375,
      "completions/mean_terminated_length": 299.80645161290323,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 175.0,
      "epoch": 0.059470468431771895,
      "grad_norm": 0.14981601476659048,
      "kl": 0.05426025390625,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 3959162.0,
      "reward": 4.734375,
      "reward_std": 1.2687596082687378,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.734375,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 707.0,
      "completions/max_terminated_length": 707.0,
      "completions/mean_length": 274.125,
      "completions/mean_terminated_length": 274.125,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "epoch": 0.059877800407331976,
      "grad_norm": 0.13171278590355248,
      "kl": 0.03729248046875,
      "learning_rate": 1e-06,
      "loss": 0.0037,
      "num_tokens": 3982822.0,
      "reward": 4.734375,
      "reward_std": 1.3779085874557495,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.609375,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1145.0,
      "completions/max_terminated_length": 1145.0,
      "completions/mean_length": 370.4375,
      "completions/mean_terminated_length": 370.4375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "epoch": 0.06028513238289206,
      "grad_norm": 0.06528599734141968,
      "kl": 0.09295654296875,
      "learning_rate": 1e-06,
      "loss": 0.0075,
      "num_tokens": 4009284.0,
      "reward": 4.484375,
      "reward_std": 0.6813257336616516,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.546875,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 551.0,
      "completions/max_terminated_length": 551.0,
      "completions/mean_length": 301.625,
      "completions/mean_terminated_length": 301.625,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.06069246435845214,
      "grad_norm": 0.14054431347384852,
      "kl": 0.40924072265625,
      "learning_rate": 1e-06,
      "loss": 0.0026,
      "num_tokens": 4034544.0,
      "reward": 4.7734375,
      "reward_std": 1.5535788536071777,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.7265625,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 494.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 289.3125,
      "completions/mean_terminated_length": 289.3125,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.06109979633401222,
      "grad_norm": 0.10053438279511238,
      "kl": 0.16851806640625,
      "learning_rate": 1e-06,
      "loss": 0.0028,
      "num_tokens": 4058338.0,
      "reward": 5.328125,
      "reward_std": 1.0581912994384766,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.953125,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 521.0,
      "completions/max_terminated_length": 521.0,
      "completions/mean_length": 276.28125,
      "completions/mean_terminated_length": 276.28125,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "epoch": 0.0615071283095723,
      "grad_norm": 0.0999585842466013,
      "kl": 0.150634765625,
      "learning_rate": 1e-06,
      "loss": -0.0001,
      "num_tokens": 4082147.0,
      "reward": 5.703125,
      "reward_std": 0.9247983694076538,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.953125,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 914.0,
      "completions/max_terminated_length": 914.0,
      "completions/mean_length": 479.0,
      "completions/mean_terminated_length": 479.0,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.06191446028513238,
      "grad_norm": 0.13048559366632506,
      "kl": 0.03289794921875,
      "learning_rate": 1e-06,
      "loss": -0.0008,
      "num_tokens": 4112499.0,
      "reward": 4.4453125,
      "reward_std": 1.2833878993988037,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1075.0,
      "completions/max_terminated_length": 1075.0,
      "completions/mean_length": 480.875,
      "completions/mean_terminated_length": 480.875,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 0.06232179226069246,
      "grad_norm": 0.06546655673170758,
      "kl": 0.120361328125,
      "learning_rate": 1e-06,
      "loss": -0.0033,
      "num_tokens": 4142655.0,
      "reward": 4.1015625,
      "reward_std": 0.5515722632408142,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9453125,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1025.0,
      "completions/max_terminated_length": 1025.0,
      "completions/mean_length": 377.25,
      "completions/mean_terminated_length": 377.25,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.06272912423625254,
      "grad_norm": 0.14368937929694342,
      "kl": 0.06414794921875,
      "learning_rate": 1e-06,
      "loss": -0.002,
      "num_tokens": 4169063.0,
      "reward": 4.8359375,
      "reward_std": 1.4678257703781128,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 868.0,
      "completions/max_terminated_length": 868.0,
      "completions/mean_length": 387.0,
      "completions/mean_terminated_length": 399.48387096774195,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.06313645621181263,
      "grad_norm": 0.14528266911903864,
      "kl": 0.094482421875,
      "learning_rate": 1e-06,
      "loss": -0.0087,
      "num_tokens": 4200145.0,
      "reward": 4.3359375,
      "reward_std": 0.9601633548736572,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.6484375,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 662.0,
      "completions/max_terminated_length": 662.0,
      "completions/mean_length": 332.25,
      "completions/mean_terminated_length": 342.96774193548384,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.0635437881873727,
      "grad_norm": 0.12773570890855301,
      "kl": 0.0762939453125,
      "learning_rate": 1e-06,
      "loss": -0.0061,
      "num_tokens": 4229192.0,
      "reward": 4.25,
      "reward_std": 1.2361493110656738,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.734375,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 826.0,
      "completions/max_terminated_length": 826.0,
      "completions/mean_length": 387.4375,
      "completions/mean_terminated_length": 387.4375,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "epoch": 0.06395112016293279,
      "grad_norm": 0.09874887518593596,
      "kl": 0.0712890625,
      "learning_rate": 1e-06,
      "loss": -0.0029,
      "num_tokens": 4257246.0,
      "reward": 4.3359375,
      "reward_std": 1.028090000152588,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7109375,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1465.0,
      "completions/max_terminated_length": 1465.0,
      "completions/mean_length": 542.46875,
      "completions/mean_terminated_length": 559.9677419354839,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 246.0,
      "epoch": 0.06435845213849287,
      "grad_norm": 0.127277549418077,
      "kl": 0.029144287109375,
      "learning_rate": 1e-06,
      "loss": -0.0043,
      "num_tokens": 4293910.0,
      "reward": 4.296875,
      "reward_std": 0.9780712127685547,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.65625,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 654.0,
      "completions/max_terminated_length": 654.0,
      "completions/mean_length": 374.5,
      "completions/mean_terminated_length": 374.5,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.06476578411405295,
      "grad_norm": 0.13527228501810654,
      "kl": 0.06500244140625,
      "learning_rate": 1e-06,
      "loss": -0.0046,
      "num_tokens": 4321214.0,
      "reward": 4.6953125,
      "reward_std": 1.41321861743927,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6328125,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 667.0,
      "completions/max_terminated_length": 667.0,
      "completions/mean_length": 414.46875,
      "completions/mean_terminated_length": 414.46875,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "epoch": 0.06517311608961303,
      "grad_norm": 0.12072646535966278,
      "kl": 0.06292724609375,
      "learning_rate": 1e-06,
      "loss": -0.0072,
      "num_tokens": 4348813.0,
      "reward": 4.2890625,
      "reward_std": 1.082193374633789,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8515625,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 745.0,
      "completions/max_terminated_length": 745.0,
      "completions/mean_length": 469.96875,
      "completions/mean_terminated_length": 469.96875,
      "completions/min_length": 228.0,
      "completions/min_terminated_length": 228.0,
      "epoch": 0.06558044806517312,
      "grad_norm": 0.10543401512652517,
      "kl": 0.0916748046875,
      "learning_rate": 1e-06,
      "loss": -0.0042,
      "num_tokens": 4379212.0,
      "reward": 3.265625,
      "reward_std": 0.684702455997467,
      "rewards/cargo_build_reward": 0.15625,
      "rewards/cargo_clippy_reward": 0.15625,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.953125,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 638.0,
      "completions/max_terminated_length": 638.0,
      "completions/mean_length": 277.59375,
      "completions/mean_terminated_length": 277.59375,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "epoch": 0.0659877800407332,
      "grad_norm": 0.11898224685444178,
      "kl": 0.0526123046875,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 4403623.0,
      "reward": 5.1171875,
      "reward_std": 1.2387746572494507,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.4921875,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 826.0,
      "completions/max_terminated_length": 826.0,
      "completions/mean_length": 333.78125,
      "completions/mean_terminated_length": 344.5483870967742,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 169.0,
      "epoch": 0.06639511201629328,
      "grad_norm": 0.13795453576396063,
      "kl": 0.272216796875,
      "learning_rate": 1e-06,
      "loss": -0.0219,
      "num_tokens": 4432845.0,
      "reward": 5.109375,
      "reward_std": 1.3525934219360352,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 699.0,
      "completions/max_terminated_length": 699.0,
      "completions/mean_length": 398.25,
      "completions/mean_terminated_length": 398.25,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 0.06680244399185337,
      "grad_norm": 0.10619325377296591,
      "kl": 0.0416259765625,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 4461021.0,
      "reward": 4.4609375,
      "reward_std": 0.8485924005508423,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 814.0,
      "completions/max_terminated_length": 814.0,
      "completions/mean_length": 336.375,
      "completions/mean_terminated_length": 336.375,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "epoch": 0.06720977596741344,
      "grad_norm": 0.12246407602729473,
      "kl": 0.34552001953125,
      "learning_rate": 1e-06,
      "loss": 0.0197,
      "num_tokens": 4486409.0,
      "reward": 5.3828125,
      "reward_std": 1.0585535764694214,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 779.0,
      "completions/max_terminated_length": 779.0,
      "completions/mean_length": 220.0625,
      "completions/mean_terminated_length": 220.0625,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "epoch": 0.06761710794297353,
      "grad_norm": 0.12973837432013488,
      "kl": 0.320556640625,
      "learning_rate": 1e-06,
      "loss": 0.0063,
      "num_tokens": 4507763.0,
      "reward": 5.7578125,
      "reward_std": 1.0621415376663208,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.65625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7578125,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 885.0,
      "completions/max_terminated_length": 885.0,
      "completions/mean_length": 356.28125,
      "completions/mean_terminated_length": 356.28125,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "epoch": 0.0680244399185336,
      "grad_norm": 0.13690153487600087,
      "kl": 0.09442138671875,
      "learning_rate": 1e-06,
      "loss": 0.014,
      "num_tokens": 4534548.0,
      "reward": 4.3671875,
      "reward_std": 1.2160022258758545,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 995.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 515.25,
      "completions/mean_terminated_length": 531.8709677419355,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 203.0,
      "epoch": 0.06843177189409369,
      "grad_norm": 0.14113580127363495,
      "kl": 0.053314208984375,
      "learning_rate": 1e-06,
      "loss": -0.0049,
      "num_tokens": 4569828.0,
      "reward": 3.75,
      "reward_std": 1.1177539825439453,
      "rewards/cargo_build_reward": 0.4375,
      "rewards/cargo_clippy_reward": 0.4375,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 739.0,
      "completions/max_terminated_length": 739.0,
      "completions/mean_length": 466.65625,
      "completions/mean_terminated_length": 466.65625,
      "completions/min_length": 262.0,
      "completions/min_terminated_length": 262.0,
      "epoch": 0.06883910386965376,
      "grad_norm": 0.14096201917283618,
      "kl": 0.081207275390625,
      "learning_rate": 1e-06,
      "loss": -0.0016,
      "num_tokens": 4599761.0,
      "reward": 4.1484375,
      "reward_std": 1.1017944812774658,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.7265625,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 564.0,
      "completions/max_terminated_length": 564.0,
      "completions/mean_length": 326.875,
      "completions/mean_terminated_length": 326.875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "epoch": 0.06924643584521385,
      "grad_norm": 0.12424183311593773,
      "kl": 0.0482177734375,
      "learning_rate": 1e-06,
      "loss": 0.0022,
      "num_tokens": 4624853.0,
      "reward": 4.9296875,
      "reward_std": 1.4418165683746338,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9296875,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 692.0,
      "completions/max_terminated_length": 692.0,
      "completions/mean_length": 405.59375,
      "completions/mean_terminated_length": 405.59375,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "epoch": 0.06965376782077393,
      "grad_norm": 0.15246901793772144,
      "kl": 0.03509521484375,
      "learning_rate": 1e-06,
      "loss": 0.0032,
      "num_tokens": 4652536.0,
      "reward": 4.6953125,
      "reward_std": 1.1468729972839355,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9453125,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 852.0,
      "completions/max_terminated_length": 852.0,
      "completions/mean_length": 393.15625,
      "completions/mean_terminated_length": 393.15625,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "epoch": 0.07006109979633401,
      "grad_norm": 0.10859157260107942,
      "kl": 0.07415771484375,
      "learning_rate": 1e-06,
      "loss": 0.0089,
      "num_tokens": 4680557.0,
      "reward": 4.890625,
      "reward_std": 1.0702264308929443,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1665.0,
      "completions/max_terminated_length": 1665.0,
      "completions/mean_length": 468.5625,
      "completions/mean_terminated_length": 468.5625,
      "completions/min_length": 263.0,
      "completions/min_terminated_length": 263.0,
      "epoch": 0.07046843177189409,
      "grad_norm": 0.08929775595064933,
      "kl": 0.03656005859375,
      "learning_rate": 1e-06,
      "loss": -0.0019,
      "num_tokens": 4710519.0,
      "reward": 4.1640625,
      "reward_std": 0.8054457902908325,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6640625,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 777.0,
      "completions/max_terminated_length": 777.0,
      "completions/mean_length": 294.8125,
      "completions/mean_terminated_length": 294.8125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.07087576374745418,
      "grad_norm": 0.10009815562182411,
      "kl": 0.04638671875,
      "learning_rate": 1e-06,
      "loss": -0.0036,
      "num_tokens": 4734401.0,
      "reward": 5.078125,
      "reward_std": 1.4493857622146606,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.59375,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 976.0,
      "completions/max_terminated_length": 976.0,
      "completions/mean_length": 511.3125,
      "completions/mean_terminated_length": 511.3125,
      "completions/min_length": 261.0,
      "completions/min_terminated_length": 261.0,
      "epoch": 0.07128309572301425,
      "grad_norm": 0.15151117326113023,
      "kl": 0.60736083984375,
      "learning_rate": 1e-06,
      "loss": 0.0156,
      "num_tokens": 4765819.0,
      "reward": 4.46875,
      "reward_std": 1.2704614400863647,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.59375,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 661.0,
      "completions/max_terminated_length": 661.0,
      "completions/mean_length": 357.28125,
      "completions/mean_terminated_length": 357.28125,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.07169042769857434,
      "grad_norm": 0.07947011852154963,
      "kl": 0.04974365234375,
      "learning_rate": 1e-06,
      "loss": 0.0022,
      "num_tokens": 4791852.0,
      "reward": 4.640625,
      "reward_std": 0.7895081043243408,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.765625,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 975.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 512.96875,
      "completions/mean_terminated_length": 512.96875,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 0.07209775967413443,
      "grad_norm": 0.10662422981590515,
      "kl": 0.0921630859375,
      "learning_rate": 1e-06,
      "loss": -0.0095,
      "num_tokens": 4823419.0,
      "reward": 3.4375,
      "reward_std": 0.9471868276596069,
      "rewards/cargo_build_reward": 0.3125,
      "rewards/cargo_clippy_reward": 0.3125,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8125,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 491.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 299.03125,
      "completions/mean_terminated_length": 299.03125,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 0.0725050916496945,
      "grad_norm": 0.08467705969064211,
      "kl": 0.04705810546875,
      "learning_rate": 1e-06,
      "loss": 0.0039,
      "num_tokens": 4847428.0,
      "reward": 5.0625,
      "reward_std": 0.700248122215271,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9375,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 631.0,
      "completions/max_terminated_length": 631.0,
      "completions/mean_length": 330.0,
      "completions/mean_terminated_length": 330.0,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "epoch": 0.07291242362525459,
      "grad_norm": 0.1323091423700111,
      "kl": 0.1751708984375,
      "learning_rate": 1e-06,
      "loss": 0.0062,
      "num_tokens": 4872932.0,
      "reward": 4.7890625,
      "reward_std": 1.2005279064178467,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8515625,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 760.0,
      "completions/max_terminated_length": 760.0,
      "completions/mean_length": 381.96875,
      "completions/mean_terminated_length": 381.96875,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.07331975560081466,
      "grad_norm": 0.12808650426853077,
      "kl": 0.0498046875,
      "learning_rate": 1e-06,
      "loss": 0.0063,
      "num_tokens": 4900195.0,
      "reward": 4.8046875,
      "reward_std": 0.9538981914520264,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 662.0,
      "completions/max_terminated_length": 662.0,
      "completions/mean_length": 304.84375,
      "completions/mean_terminated_length": 304.84375,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.07372708757637475,
      "grad_norm": 0.10331216337152113,
      "kl": 0.07073974609375,
      "learning_rate": 1e-06,
      "loss": 0.0089,
      "num_tokens": 4925254.0,
      "reward": 4.8984375,
      "reward_std": 0.8840129375457764,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.6640625,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 450.0,
      "completions/max_terminated_length": 450.0,
      "completions/mean_length": 273.34375,
      "completions/mean_terminated_length": 273.34375,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "epoch": 0.07413441955193482,
      "grad_norm": 0.12623768958158862,
      "kl": 0.0665283203125,
      "learning_rate": 1e-06,
      "loss": 0.0015,
      "num_tokens": 4948985.0,
      "reward": 4.84375,
      "reward_std": 1.2009623050689697,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.90625,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1525.0,
      "completions/max_terminated_length": 1525.0,
      "completions/mean_length": 422.0,
      "completions/mean_terminated_length": 422.0,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.07454175152749491,
      "grad_norm": 0.12579157587186837,
      "kl": 0.046142578125,
      "learning_rate": 1e-06,
      "loss": 0.0087,
      "num_tokens": 4977897.0,
      "reward": 4.859375,
      "reward_std": 1.0070652961730957,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 799.0,
      "completions/max_terminated_length": 799.0,
      "completions/mean_length": 426.53125,
      "completions/mean_terminated_length": 426.53125,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "epoch": 0.07494908350305499,
      "grad_norm": 0.13752570104041778,
      "kl": 0.041015625,
      "learning_rate": 1e-06,
      "loss": -0.002,
      "num_tokens": 5007234.0,
      "reward": 4.1328125,
      "reward_std": 1.2121552228927612,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6953125,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 765.0,
      "completions/max_terminated_length": 765.0,
      "completions/mean_length": 434.125,
      "completions/mean_terminated_length": 434.125,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.07535641547861507,
      "grad_norm": 0.11594259359655909,
      "kl": 0.06817626953125,
      "learning_rate": 1e-06,
      "loss": 0.0086,
      "num_tokens": 5036142.0,
      "reward": 4.8203125,
      "reward_std": 0.9454158544540405,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 549.0,
      "completions/max_terminated_length": 549.0,
      "completions/mean_length": 298.6875,
      "completions/mean_terminated_length": 298.6875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "epoch": 0.07576374745417515,
      "grad_norm": 0.1308820961805529,
      "kl": 0.05621337890625,
      "learning_rate": 1e-06,
      "loss": 0.0082,
      "num_tokens": 5060300.0,
      "reward": 4.8671875,
      "reward_std": 1.0690581798553467,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 964.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 379.40625,
      "completions/mean_terminated_length": 379.40625,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "epoch": 0.07617107942973524,
      "grad_norm": 0.10645042668270711,
      "kl": 0.10748291015625,
      "learning_rate": 1e-06,
      "loss": -0.0005,
      "num_tokens": 5087345.0,
      "reward": 5.0078125,
      "reward_std": 1.0140846967697144,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7265625,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 665.0,
      "completions/max_terminated_length": 665.0,
      "completions/mean_length": 374.34375,
      "completions/mean_terminated_length": 374.34375,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.07657841140529531,
      "grad_norm": 0.15089256688296765,
      "kl": 0.9410400390625,
      "learning_rate": 1e-06,
      "loss": -0.0055,
      "num_tokens": 5115100.0,
      "reward": 4.4921875,
      "reward_std": 1.379470705986023,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 579.0,
      "completions/max_terminated_length": 579.0,
      "completions/mean_length": 365.0,
      "completions/mean_terminated_length": 365.0,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "epoch": 0.0769857433808554,
      "grad_norm": 0.1569538088020158,
      "kl": 0.1473388671875,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 5142796.0,
      "reward": 4.1015625,
      "reward_std": 1.5070769786834717,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.4921875,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 770.0,
      "completions/max_terminated_length": 770.0,
      "completions/mean_length": 363.0,
      "completions/mean_terminated_length": 363.0,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "epoch": 0.07739307535641547,
      "grad_norm": 0.05935078934065327,
      "kl": 0.0694580078125,
      "learning_rate": 1e-06,
      "loss": 0.0024,
      "num_tokens": 5169356.0,
      "reward": 4.4375,
      "reward_std": 0.42242497205734253,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.5,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9375,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 559.0,
      "completions/max_terminated_length": 559.0,
      "completions/mean_length": 298.71875,
      "completions/mean_terminated_length": 298.71875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.07780040733197556,
      "grad_norm": 0.12545164979791618,
      "kl": 0.08349609375,
      "learning_rate": 1e-06,
      "loss": -0.0034,
      "num_tokens": 5192971.0,
      "reward": 4.484375,
      "reward_std": 1.0269687175750732,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1053.0,
      "completions/max_terminated_length": 1053.0,
      "completions/mean_length": 457.71875,
      "completions/mean_terminated_length": 457.71875,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 0.07820773930753565,
      "grad_norm": 0.14005548709107163,
      "kl": 0.04852294921875,
      "learning_rate": 1e-06,
      "loss": -0.0008,
      "num_tokens": 5222666.0,
      "reward": 4.3046875,
      "reward_std": 1.2621102333068848,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 506.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 286.3125,
      "completions/mean_terminated_length": 286.3125,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "epoch": 0.07861507128309572,
      "grad_norm": 0.1185698427295713,
      "kl": 0.12652587890625,
      "learning_rate": 1e-06,
      "loss": -0.0057,
      "num_tokens": 5246404.0,
      "reward": 5.3828125,
      "reward_std": 1.0377273559570312,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.5625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.5078125,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3218.0,
      "completions/max_terminated_length": 3218.0,
      "completions/mean_length": 491.1875,
      "completions/mean_terminated_length": 491.1875,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "epoch": 0.07902240325865581,
      "grad_norm": 0.13419747893981798,
      "kl": 0.167236328125,
      "learning_rate": 1e-06,
      "loss": 0.0038,
      "num_tokens": 5277970.0,
      "reward": 4.6171875,
      "reward_std": 0.9943197965621948,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 975.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 392.84375,
      "completions/mean_terminated_length": 392.84375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 0.07942973523421588,
      "grad_norm": 0.1553068882473102,
      "kl": 1.0399169921875,
      "learning_rate": 1e-06,
      "loss": -0.0009,
      "num_tokens": 5306125.0,
      "reward": 4.6171875,
      "reward_std": 1.373223066329956,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6171875,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 932.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 328.75,
      "completions/mean_terminated_length": 328.75,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.07983706720977597,
      "grad_norm": 0.09792381388479776,
      "kl": 0.07232666015625,
      "learning_rate": 1e-06,
      "loss": -0.0034,
      "num_tokens": 5331549.0,
      "reward": 5.34375,
      "reward_std": 0.6353054046630859,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8125,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 727.0,
      "completions/max_terminated_length": 727.0,
      "completions/mean_length": 387.15625,
      "completions/mean_terminated_length": 387.15625,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "epoch": 0.08024439918533605,
      "grad_norm": 0.1475298153624464,
      "kl": 0.15081787109375,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 5358842.0,
      "reward": 4.359375,
      "reward_std": 1.2290772199630737,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 557.0,
      "completions/max_terminated_length": 557.0,
      "completions/mean_length": 314.46875,
      "completions/mean_terminated_length": 314.46875,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "epoch": 0.08065173116089613,
      "grad_norm": 0.10582358722949149,
      "kl": 0.053466796875,
      "learning_rate": 1e-06,
      "loss": 0.0012,
      "num_tokens": 5384809.0,
      "reward": 5.765625,
      "reward_std": 1.0285557508468628,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.59375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.703125,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 659.0,
      "completions/max_terminated_length": 659.0,
      "completions/mean_length": 328.90625,
      "completions/mean_terminated_length": 328.90625,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "epoch": 0.08105906313645621,
      "grad_norm": 0.15206191338594935,
      "kl": 0.2681884765625,
      "learning_rate": 1e-06,
      "loss": -0.0016,
      "num_tokens": 5411046.0,
      "reward": 5.0625,
      "reward_std": 0.9651427268981934,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2047.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 358.4375,
      "completions/mean_terminated_length": 358.4375,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.0814663951120163,
      "grad_norm": 0.1075376922189615,
      "kl": 0.386962890625,
      "learning_rate": 1e-06,
      "loss": 0.0125,
      "num_tokens": 5437500.0,
      "reward": 6.1171875,
      "reward_std": 1.1414694786071777,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9296875,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 745.0,
      "completions/max_terminated_length": 745.0,
      "completions/mean_length": 349.46875,
      "completions/mean_terminated_length": 349.46875,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.08187372708757637,
      "grad_norm": 0.07006979994831898,
      "kl": 0.0355072021484375,
      "learning_rate": 1e-06,
      "loss": -0.0013,
      "num_tokens": 5463139.0,
      "reward": 4.484375,
      "reward_std": 0.5211516618728638,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1048.0,
      "completions/max_terminated_length": 1048.0,
      "completions/mean_length": 343.75,
      "completions/mean_terminated_length": 343.75,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "epoch": 0.08228105906313646,
      "grad_norm": 0.13950721322854165,
      "kl": 0.053985595703125,
      "learning_rate": 1e-06,
      "loss": 0.0046,
      "num_tokens": 5489619.0,
      "reward": 4.4765625,
      "reward_std": 1.311608910560608,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.5390625,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1506.0,
      "completions/max_terminated_length": 1506.0,
      "completions/mean_length": 564.96875,
      "completions/mean_terminated_length": 564.96875,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "epoch": 0.08268839103869653,
      "grad_norm": 0.13863625665109122,
      "kl": 0.014312744140625,
      "learning_rate": 1e-06,
      "loss": 0.0034,
      "num_tokens": 5523226.0,
      "reward": 3.9453125,
      "reward_std": 1.123884916305542,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.3828125,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1253.0,
      "completions/max_terminated_length": 1253.0,
      "completions/mean_length": 361.25,
      "completions/mean_terminated_length": 361.25,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "epoch": 0.08309572301425662,
      "grad_norm": 0.0941997688772263,
      "kl": 0.0204620361328125,
      "learning_rate": 1e-06,
      "loss": -0.0088,
      "num_tokens": 5549954.0,
      "reward": 5.8125,
      "reward_std": 0.9149646759033203,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.59375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 800.0,
      "completions/max_terminated_length": 800.0,
      "completions/mean_length": 381.34375,
      "completions/mean_terminated_length": 381.34375,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "epoch": 0.0835030549898167,
      "grad_norm": 0.12969089104595022,
      "kl": 0.10968017578125,
      "learning_rate": 1e-06,
      "loss": -0.0017,
      "num_tokens": 5577925.0,
      "reward": 4.6875,
      "reward_std": 1.2291152477264404,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.75,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 905.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 369.09375,
      "completions/mean_terminated_length": 369.09375,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.08391038696537678,
      "grad_norm": 0.12012906410037177,
      "kl": 0.021697998046875,
      "learning_rate": 1e-06,
      "loss": 0.0019,
      "num_tokens": 5604616.0,
      "reward": 4.9765625,
      "reward_std": 1.0764740705490112,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7265625,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 874.0,
      "completions/max_terminated_length": 874.0,
      "completions/mean_length": 411.78125,
      "completions/mean_terminated_length": 411.78125,
      "completions/min_length": 221.0,
      "completions/min_terminated_length": 221.0,
      "epoch": 0.08431771894093687,
      "grad_norm": 0.12298712714257735,
      "kl": 0.051971435546875,
      "learning_rate": 1e-06,
      "loss": 0.0013,
      "num_tokens": 5632393.0,
      "reward": 4.21875,
      "reward_std": 1.062387228012085,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.78125,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 445.0,
      "completions/max_terminated_length": 445.0,
      "completions/mean_length": 280.75,
      "completions/mean_terminated_length": 280.75,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.08472505091649694,
      "grad_norm": 0.09282797754422388,
      "kl": 0.34716796875,
      "learning_rate": 1e-06,
      "loss": 0.0064,
      "num_tokens": 5656425.0,
      "reward": 5.234375,
      "reward_std": 1.1892833709716797,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.734375,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 598.0,
      "completions/max_terminated_length": 598.0,
      "completions/mean_length": 313.3125,
      "completions/mean_terminated_length": 323.4193548387097,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 0.08513238289205703,
      "grad_norm": 0.13863374958378186,
      "kl": 0.033966064453125,
      "learning_rate": 1e-06,
      "loss": 0.0006,
      "num_tokens": 5685387.0,
      "reward": 4.7734375,
      "reward_std": 1.572913408279419,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 569.0,
      "completions/max_terminated_length": 569.0,
      "completions/mean_length": 322.625,
      "completions/mean_terminated_length": 322.625,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "epoch": 0.0855397148676171,
      "grad_norm": 0.16979858976128354,
      "kl": 0.07171630859375,
      "learning_rate": 1e-06,
      "loss": 0.0089,
      "num_tokens": 5710951.0,
      "reward": 4.671875,
      "reward_std": 1.4163511991500854,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 627.0,
      "completions/max_terminated_length": 627.0,
      "completions/mean_length": 357.3125,
      "completions/mean_terminated_length": 357.3125,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.0859470468431772,
      "grad_norm": 0.13105401371206019,
      "kl": 0.0201873779296875,
      "learning_rate": 1e-06,
      "loss": -0.0027,
      "num_tokens": 5737321.0,
      "reward": 5.5234375,
      "reward_std": 1.11106276512146,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9609375,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 758.0,
      "completions/max_terminated_length": 758.0,
      "completions/mean_length": 346.1875,
      "completions/mean_terminated_length": 346.1875,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "epoch": 0.08635437881873727,
      "grad_norm": 0.0925506339214318,
      "kl": 0.02349853515625,
      "learning_rate": 1e-06,
      "loss": -0.0002,
      "num_tokens": 5763687.0,
      "reward": 5.15625,
      "reward_std": 0.7716832756996155,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.65625,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 903.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 413.4375,
      "completions/mean_terminated_length": 413.4375,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "epoch": 0.08676171079429736,
      "grad_norm": 0.10279410328832947,
      "kl": 0.021636962890625,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 5791749.0,
      "reward": 4.75,
      "reward_std": 0.8146636486053467,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.75,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 887.0,
      "completions/max_terminated_length": 887.0,
      "completions/mean_length": 449.03125,
      "completions/mean_terminated_length": 449.03125,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.08716904276985743,
      "grad_norm": 0.11318549135026058,
      "kl": 0.0506591796875,
      "learning_rate": 1e-06,
      "loss": -0.0016,
      "num_tokens": 5821478.0,
      "reward": 4.296875,
      "reward_std": 0.9958111643791199,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 681.0,
      "completions/max_terminated_length": 681.0,
      "completions/mean_length": 356.09375,
      "completions/mean_terminated_length": 356.09375,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "epoch": 0.08757637474541752,
      "grad_norm": 0.11381759941293987,
      "kl": 0.024627685546875,
      "learning_rate": 1e-06,
      "loss": 0.0076,
      "num_tokens": 5847425.0,
      "reward": 4.921875,
      "reward_std": 1.115452527999878,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 616.0,
      "completions/max_terminated_length": 616.0,
      "completions/mean_length": 356.8125,
      "completions/mean_terminated_length": 356.8125,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "epoch": 0.08798370672097759,
      "grad_norm": 0.11961343666388408,
      "kl": 0.028656005859375,
      "learning_rate": 1e-06,
      "loss": 0.0026,
      "num_tokens": 5874387.0,
      "reward": 5.171875,
      "reward_std": 1.093719482421875,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.84375,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 796.0,
      "completions/max_terminated_length": 796.0,
      "completions/mean_length": 389.5625,
      "completions/mean_terminated_length": 389.5625,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "epoch": 0.08839103869653768,
      "grad_norm": 0.18720520794622916,
      "kl": 0.021575927734375,
      "learning_rate": 1e-06,
      "loss": 0.0061,
      "num_tokens": 5901989.0,
      "reward": 4.3125,
      "reward_std": 1.3922038078308105,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.5625,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1038.0,
      "completions/max_terminated_length": 1038.0,
      "completions/mean_length": 456.875,
      "completions/mean_terminated_length": 456.875,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.08879837067209775,
      "grad_norm": 0.09293499024827523,
      "kl": 0.038543701171875,
      "learning_rate": 1e-06,
      "loss": -0.0013,
      "num_tokens": 5931257.0,
      "reward": 4.484375,
      "reward_std": 0.7560322880744934,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.671875,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 713.0,
      "completions/max_terminated_length": 713.0,
      "completions/mean_length": 425.125,
      "completions/mean_terminated_length": 425.125,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "epoch": 0.08920570264765784,
      "grad_norm": 0.16333880577412674,
      "kl": 0.03631591796875,
      "learning_rate": 1e-06,
      "loss": 0.0145,
      "num_tokens": 5959469.0,
      "reward": 4.609375,
      "reward_std": 1.1915391683578491,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.890625,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 628.0,
      "completions/max_terminated_length": 628.0,
      "completions/mean_length": 334.53125,
      "completions/mean_terminated_length": 334.53125,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "epoch": 0.08961303462321792,
      "grad_norm": 0.13580664034432854,
      "kl": 0.13531494140625,
      "learning_rate": 1e-06,
      "loss": -0.0021,
      "num_tokens": 5985382.0,
      "reward": 4.8203125,
      "reward_std": 1.308077096939087,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.5703125,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 966.0,
      "completions/max_terminated_length": 966.0,
      "completions/mean_length": 423.8125,
      "completions/mean_terminated_length": 423.8125,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.090020366598778,
      "grad_norm": 0.18168422733726539,
      "kl": 0.136383056640625,
      "learning_rate": 1e-06,
      "loss": 0.0086,
      "num_tokens": 6014240.0,
      "reward": 3.90625,
      "reward_std": 1.5534679889678955,
      "rewards/cargo_build_reward": 0.40625,
      "rewards/cargo_clippy_reward": 0.40625,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.59375,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 667.0,
      "completions/max_terminated_length": 667.0,
      "completions/mean_length": 322.6875,
      "completions/mean_terminated_length": 322.6875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "epoch": 0.09042769857433809,
      "grad_norm": 0.11897470122290314,
      "kl": 0.063751220703125,
      "learning_rate": 1e-06,
      "loss": 0.0039,
      "num_tokens": 6039182.0,
      "reward": 5.609375,
      "reward_std": 1.1206417083740234,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 541.0,
      "completions/max_terminated_length": 541.0,
      "completions/mean_length": 341.5625,
      "completions/mean_terminated_length": 352.5806451612903,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.09083503054989817,
      "grad_norm": 0.14033574758499026,
      "kl": 0.049041748046875,
      "learning_rate": 1e-06,
      "loss": -0.0134,
      "num_tokens": 6068859.0,
      "reward": 4.8515625,
      "reward_std": 1.2350282669067383,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 925.0,
      "completions/max_terminated_length": 925.0,
      "completions/mean_length": 455.90625,
      "completions/mean_terminated_length": 455.90625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "epoch": 0.09124236252545825,
      "grad_norm": 0.14672812399725055,
      "kl": 0.032989501953125,
      "learning_rate": 1e-06,
      "loss": 0.0102,
      "num_tokens": 6097864.0,
      "reward": 4.78125,
      "reward_std": 0.9682382941246033,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 542.0,
      "completions/max_terminated_length": 542.0,
      "completions/mean_length": 290.40625,
      "completions/mean_terminated_length": 290.40625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "epoch": 0.09164969450101833,
      "grad_norm": 0.10355516835859621,
      "kl": 0.031402587890625,
      "learning_rate": 1e-06,
      "loss": 0.0036,
      "num_tokens": 6122701.0,
      "reward": 5.3515625,
      "reward_std": 1.052793264389038,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7890625,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 851.0,
      "completions/max_terminated_length": 851.0,
      "completions/mean_length": 345.96875,
      "completions/mean_terminated_length": 357.1290322580645,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 174.0,
      "epoch": 0.09205702647657842,
      "grad_norm": 0.18047877874778687,
      "kl": 0.073516845703125,
      "learning_rate": 1e-06,
      "loss": -0.0048,
      "num_tokens": 6152920.0,
      "reward": 4.3203125,
      "reward_std": 1.6043800115585327,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.6328125,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 635.0,
      "completions/max_terminated_length": 635.0,
      "completions/mean_length": 326.96875,
      "completions/mean_terminated_length": 326.96875,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "epoch": 0.09246435845213849,
      "grad_norm": 0.13959796301881863,
      "kl": 0.035247802734375,
      "learning_rate": 1e-06,
      "loss": 0.0054,
      "num_tokens": 6178071.0,
      "reward": 5.046875,
      "reward_std": 1.2643322944641113,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.984375,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 783.0,
      "completions/max_terminated_length": 783.0,
      "completions/mean_length": 391.0625,
      "completions/mean_terminated_length": 391.0625,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "epoch": 0.09287169042769858,
      "grad_norm": 0.15727370379536706,
      "kl": 0.157135009765625,
      "learning_rate": 1e-06,
      "loss": 0.0027,
      "num_tokens": 6205689.0,
      "reward": 4.8515625,
      "reward_std": 1.229414939880371,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6015625,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 876.0,
      "completions/max_terminated_length": 876.0,
      "completions/mean_length": 369.34375,
      "completions/mean_terminated_length": 369.34375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.09327902240325865,
      "grad_norm": 0.09634590946427983,
      "kl": 0.025482177734375,
      "learning_rate": 1e-06,
      "loss": 0.0056,
      "num_tokens": 6232316.0,
      "reward": 5.6953125,
      "reward_std": 0.7354626059532166,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9453125,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 387.0,
      "completions/max_terminated_length": 387.0,
      "completions/mean_length": 264.03125,
      "completions/mean_terminated_length": 264.03125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.09368635437881874,
      "grad_norm": 0.06070469187112743,
      "kl": 0.042938232421875,
      "learning_rate": 1e-06,
      "loss": 0.0004,
      "num_tokens": 6255941.0,
      "reward": 5.4296875,
      "reward_std": 0.6073668003082275,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8671875,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 638.0,
      "completions/max_terminated_length": 638.0,
      "completions/mean_length": 271.09375,
      "completions/mean_terminated_length": 271.09375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "epoch": 0.09409368635437881,
      "grad_norm": 0.11551822096300121,
      "kl": 0.043609619140625,
      "learning_rate": 1e-06,
      "loss": 0.0008,
      "num_tokens": 6279656.0,
      "reward": 5.6015625,
      "reward_std": 1.2009453773498535,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.5625,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7109375,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 443.0,
      "completions/max_terminated_length": 443.0,
      "completions/mean_length": 261.375,
      "completions/mean_terminated_length": 261.375,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "epoch": 0.0945010183299389,
      "grad_norm": 0.09422765503292208,
      "kl": 0.05816650390625,
      "learning_rate": 1e-06,
      "loss": -0.0013,
      "num_tokens": 6303220.0,
      "reward": 5.96875,
      "reward_std": 1.0341272354125977,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.65625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 579.0,
      "completions/max_terminated_length": 579.0,
      "completions/mean_length": 376.0625,
      "completions/mean_terminated_length": 376.0625,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "epoch": 0.09490835030549898,
      "grad_norm": 0.30940106585791494,
      "kl": 3.4390869140625,
      "learning_rate": 1e-06,
      "loss": 0.0067,
      "num_tokens": 6330718.0,
      "reward": 4.4453125,
      "reward_std": 0.7659948468208313,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 638.0,
      "completions/max_terminated_length": 638.0,
      "completions/mean_length": 322.09375,
      "completions/mean_terminated_length": 322.09375,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "epoch": 0.09531568228105906,
      "grad_norm": 0.11744059123433084,
      "kl": 0.3782958984375,
      "learning_rate": 1e-06,
      "loss": 0.0034,
      "num_tokens": 6354857.0,
      "reward": 5.2734375,
      "reward_std": 0.7889077663421631,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8984375,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 826.0,
      "completions/max_terminated_length": 826.0,
      "completions/mean_length": 376.5625,
      "completions/mean_terminated_length": 376.5625,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "epoch": 0.09572301425661914,
      "grad_norm": 0.13686359963866218,
      "kl": 0.032012939453125,
      "learning_rate": 1e-06,
      "loss": 0.019,
      "num_tokens": 6381963.0,
      "reward": 4.96875,
      "reward_std": 1.042311668395996,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.703125,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 542.0,
      "completions/max_terminated_length": 542.0,
      "completions/mean_length": 323.46875,
      "completions/mean_terminated_length": 323.46875,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "epoch": 0.09613034623217923,
      "grad_norm": 0.12819766906736543,
      "kl": 0.61492919921875,
      "learning_rate": 1e-06,
      "loss": 0.0043,
      "num_tokens": 6407570.0,
      "reward": 5.4765625,
      "reward_std": 1.0316227674484253,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8515625,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 847.0,
      "completions/max_terminated_length": 847.0,
      "completions/mean_length": 464.125,
      "completions/mean_terminated_length": 464.125,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "epoch": 0.09653767820773931,
      "grad_norm": 0.1752475302003345,
      "kl": 0.0914306640625,
      "learning_rate": 1e-06,
      "loss": -0.008,
      "num_tokens": 6438094.0,
      "reward": 4.5234375,
      "reward_std": 1.512213945388794,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.5234375,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1079.0,
      "completions/max_terminated_length": 1079.0,
      "completions/mean_length": 331.5,
      "completions/mean_terminated_length": 331.5,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "epoch": 0.09694501018329939,
      "grad_norm": 0.07139200316241337,
      "kl": 0.04351806640625,
      "learning_rate": 1e-06,
      "loss": 0.0051,
      "num_tokens": 6463982.0,
      "reward": 5.2421875,
      "reward_std": 0.496543288230896,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1147.0,
      "completions/max_terminated_length": 1147.0,
      "completions/mean_length": 422.0625,
      "completions/mean_terminated_length": 422.0625,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.09735234215885948,
      "grad_norm": 0.1619291541344302,
      "kl": 0.6241455078125,
      "learning_rate": 1e-06,
      "loss": 0.0068,
      "num_tokens": 6492720.0,
      "reward": 4.640625,
      "reward_std": 1.1303948163986206,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.828125,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 710.0,
      "completions/max_terminated_length": 710.0,
      "completions/mean_length": 439.625,
      "completions/mean_terminated_length": 453.80645161290323,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 284.0,
      "epoch": 0.09775967413441955,
      "grad_norm": 0.1570995884574694,
      "kl": 0.5472412109375,
      "learning_rate": 1e-06,
      "loss": -0.0084,
      "num_tokens": 6526062.0,
      "reward": 3.5078125,
      "reward_std": 1.067907691001892,
      "rewards/cargo_build_reward": 0.375,
      "rewards/cargo_clippy_reward": 0.375,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1010.0,
      "completions/max_terminated_length": 1010.0,
      "completions/mean_length": 359.375,
      "completions/mean_terminated_length": 359.375,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "epoch": 0.09816700610997964,
      "grad_norm": 0.12425362441168204,
      "kl": 0.06414794921875,
      "learning_rate": 1e-06,
      "loss": -0.0018,
      "num_tokens": 6552658.0,
      "reward": 5.015625,
      "reward_std": 1.295576572418213,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.703125,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 889.0,
      "completions/max_terminated_length": 889.0,
      "completions/mean_length": 328.0,
      "completions/mean_terminated_length": 328.0,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.09857433808553971,
      "grad_norm": 0.11487237634073325,
      "kl": 0.05828857421875,
      "learning_rate": 1e-06,
      "loss": 0.0023,
      "num_tokens": 6578202.0,
      "reward": 5.3046875,
      "reward_std": 1.1267704963684082,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 821.0,
      "completions/max_terminated_length": 821.0,
      "completions/mean_length": 365.59375,
      "completions/mean_terminated_length": 365.59375,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "epoch": 0.0989816700610998,
      "grad_norm": 0.06695475981025639,
      "kl": 0.4957275390625,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 6605173.0,
      "reward": 4.8515625,
      "reward_std": 0.5640454888343811,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8515625,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 609.0,
      "completions/max_terminated_length": 609.0,
      "completions/mean_length": 349.625,
      "completions/mean_terminated_length": 349.625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 0.09938900203665987,
      "grad_norm": 0.14586003600460437,
      "kl": 0.06005859375,
      "learning_rate": 1e-06,
      "loss": 0.0049,
      "num_tokens": 6631137.0,
      "reward": 5.2421875,
      "reward_std": 1.2937591075897217,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 758.0,
      "completions/max_terminated_length": 758.0,
      "completions/mean_length": 386.1875,
      "completions/mean_terminated_length": 398.64516129032256,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.09979633401221996,
      "grad_norm": 0.1485284452112313,
      "kl": 0.14459228515625,
      "learning_rate": 1e-06,
      "loss": -0.0252,
      "num_tokens": 6661599.0,
      "reward": 4.6875,
      "reward_std": 1.3923399448394775,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8125,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1019.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 398.8125,
      "completions/mean_terminated_length": 398.8125,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.10020366598778004,
      "grad_norm": 0.20333341422047588,
      "kl": 0.19537353515625,
      "learning_rate": 1e-06,
      "loss": 0.0027,
      "num_tokens": 6689665.0,
      "reward": 4.6796875,
      "reward_std": 1.5273429155349731,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 565.0,
      "completions/max_terminated_length": 565.0,
      "completions/mean_length": 276.40625,
      "completions/mean_terminated_length": 285.3225806451613,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.10061099796334012,
      "grad_norm": 0.1406114255986296,
      "kl": 0.095458984375,
      "learning_rate": 1e-06,
      "loss": -0.0096,
      "num_tokens": 6717143.0,
      "reward": 4.6328125,
      "reward_std": 1.586996078491211,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 614.0,
      "completions/max_terminated_length": 614.0,
      "completions/mean_length": 380.375,
      "completions/mean_terminated_length": 380.375,
      "completions/min_length": 203.0,
      "completions/min_terminated_length": 203.0,
      "epoch": 0.1010183299389002,
      "grad_norm": 0.14199568353692169,
      "kl": 0.06744384765625,
      "learning_rate": 1e-06,
      "loss": 0.0073,
      "num_tokens": 6743987.0,
      "reward": 5.2421875,
      "reward_std": 1.2977920770645142,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 736.0,
      "completions/max_terminated_length": 736.0,
      "completions/mean_length": 357.0,
      "completions/mean_terminated_length": 357.0,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.10142566191446029,
      "grad_norm": 0.07421016421352952,
      "kl": 0.05303955078125,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 6770803.0,
      "reward": 4.34375,
      "reward_std": 0.5166193842887878,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 875.0,
      "completions/max_terminated_length": 875.0,
      "completions/mean_length": 452.90625,
      "completions/mean_terminated_length": 452.90625,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "epoch": 0.10183299389002037,
      "grad_norm": 0.0931240885796066,
      "kl": 0.086395263671875,
      "learning_rate": 1e-06,
      "loss": -0.0007,
      "num_tokens": 6800216.0,
      "reward": 4.21875,
      "reward_std": 0.7194794416427612,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.65625,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 629.0,
      "completions/max_terminated_length": 629.0,
      "completions/mean_length": 390.3125,
      "completions/mean_terminated_length": 390.3125,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 0.10224032586558045,
      "grad_norm": 0.08245473465062367,
      "kl": 0.0628662109375,
      "learning_rate": 1e-06,
      "loss": 0.0024,
      "num_tokens": 6827746.0,
      "reward": 5.1015625,
      "reward_std": 0.829085111618042,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 916.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 485.78125,
      "completions/mean_terminated_length": 485.78125,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.10264765784114054,
      "grad_norm": 0.114105467050597,
      "kl": 0.035552978515625,
      "learning_rate": 1e-06,
      "loss": 0.0059,
      "num_tokens": 6858803.0,
      "reward": 5.09375,
      "reward_std": 0.9395735859870911,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.53125,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 773.0,
      "completions/max_terminated_length": 773.0,
      "completions/mean_length": 375.03125,
      "completions/mean_terminated_length": 375.03125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.10305498981670061,
      "grad_norm": 0.08651622443758208,
      "kl": 0.0655517578125,
      "learning_rate": 1e-06,
      "loss": -0.0,
      "num_tokens": 6885972.0,
      "reward": 4.21875,
      "reward_std": 0.8213375806808472,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 667.0,
      "completions/max_terminated_length": 667.0,
      "completions/mean_length": 324.8125,
      "completions/mean_terminated_length": 324.8125,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "epoch": 0.1034623217922607,
      "grad_norm": 0.13429322999276266,
      "kl": 0.105712890625,
      "learning_rate": 1e-06,
      "loss": 0.0036,
      "num_tokens": 6911958.0,
      "reward": 4.6640625,
      "reward_std": 1.0780224800109863,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 565.0,
      "completions/max_terminated_length": 565.0,
      "completions/mean_length": 359.5,
      "completions/mean_terminated_length": 359.5,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "epoch": 0.10386965376782077,
      "grad_norm": 0.10690645919088833,
      "kl": 0.058624267578125,
      "learning_rate": 1e-06,
      "loss": -0.0031,
      "num_tokens": 6938582.0,
      "reward": 5.46875,
      "reward_std": 0.9955792427062988,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.84375,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 876.0,
      "completions/max_terminated_length": 876.0,
      "completions/mean_length": 432.65625,
      "completions/mean_terminated_length": 432.65625,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "epoch": 0.10427698574338086,
      "grad_norm": 0.13001312245704905,
      "kl": 0.067626953125,
      "learning_rate": 1e-06,
      "loss": 0.0047,
      "num_tokens": 6966995.0,
      "reward": 3.6796875,
      "reward_std": 1.1213611364364624,
      "rewards/cargo_build_reward": 0.40625,
      "rewards/cargo_clippy_reward": 0.40625,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1419.0,
      "completions/max_terminated_length": 1419.0,
      "completions/mean_length": 372.71875,
      "completions/mean_terminated_length": 372.71875,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "epoch": 0.10468431771894093,
      "grad_norm": 0.11192869953967434,
      "kl": 0.06536865234375,
      "learning_rate": 1e-06,
      "loss": 0.0033,
      "num_tokens": 6994042.0,
      "reward": 4.65625,
      "reward_std": 1.0012075901031494,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.78125,
      "step": 257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 3202.0,
      "completions/max_terminated_length": 540.0,
      "completions/mean_length": 403.0,
      "completions/mean_terminated_length": 323.1333333333333,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 173.0,
      "epoch": 0.10509164969450102,
      "grad_norm": 0.1265180220715068,
      "kl": 0.129150390625,
      "learning_rate": 1e-06,
      "loss": 0.064,
      "num_tokens": 7025338.0,
      "reward": 4.46875,
      "reward_std": 1.0521612167358398,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 0.9375,
      "rewards/non_empty_reward": 0.9375,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 704.0,
      "completions/max_terminated_length": 704.0,
      "completions/mean_length": 342.375,
      "completions/mean_terminated_length": 342.375,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.1054989816700611,
      "grad_norm": 0.1285258846222483,
      "kl": 0.093505859375,
      "learning_rate": 1e-06,
      "loss": 0.0064,
      "num_tokens": 7050894.0,
      "reward": 5.03125,
      "reward_std": 1.1087009906768799,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.84375,
      "step": 259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 862.0,
      "completions/max_terminated_length": 862.0,
      "completions/mean_length": 336.53125,
      "completions/mean_terminated_length": 336.53125,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.10590631364562118,
      "grad_norm": 0.15302848713988015,
      "kl": 0.21875,
      "learning_rate": 1e-06,
      "loss": 0.0047,
      "num_tokens": 7076143.0,
      "reward": 5.1171875,
      "reward_std": 1.249140977859497,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8984375,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 514.0,
      "completions/max_terminated_length": 514.0,
      "completions/mean_length": 310.75,
      "completions/mean_terminated_length": 310.75,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.10631364562118126,
      "grad_norm": 0.1659479395384913,
      "kl": 0.153564453125,
      "learning_rate": 1e-06,
      "loss": -0.0035,
      "num_tokens": 7100943.0,
      "reward": 5.0625,
      "reward_std": 1.3257355690002441,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 502.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 302.9375,
      "completions/mean_terminated_length": 302.9375,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "epoch": 0.10672097759674135,
      "grad_norm": 0.1403295319168937,
      "kl": 0.0731201171875,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 7124325.0,
      "reward": 6.0078125,
      "reward_std": 1.2643563747406006,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.75,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 849.0,
      "completions/max_terminated_length": 849.0,
      "completions/mean_length": 404.65625,
      "completions/mean_terminated_length": 404.65625,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.10712830957230142,
      "grad_norm": 0.13674927081734936,
      "kl": 0.04754638671875,
      "learning_rate": 1e-06,
      "loss": 0.0064,
      "num_tokens": 7152450.0,
      "reward": 5.1484375,
      "reward_std": 1.1286545991897583,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 626.0,
      "completions/max_terminated_length": 626.0,
      "completions/mean_length": 398.6875,
      "completions/mean_terminated_length": 398.6875,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "epoch": 0.10753564154786151,
      "grad_norm": 0.18572905209539947,
      "kl": 0.1798095703125,
      "learning_rate": 1e-06,
      "loss": 0.0075,
      "num_tokens": 7180888.0,
      "reward": 4.5078125,
      "reward_std": 1.3470971584320068,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 845.0,
      "completions/max_terminated_length": 845.0,
      "completions/mean_length": 368.78125,
      "completions/mean_terminated_length": 368.78125,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "epoch": 0.1079429735234216,
      "grad_norm": 0.15649574599231655,
      "kl": 0.49713134765625,
      "learning_rate": 1e-06,
      "loss": 0.0062,
      "num_tokens": 7207641.0,
      "reward": 5.4609375,
      "reward_std": 1.333330750465393,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 447.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 295.0625,
      "completions/mean_terminated_length": 295.0625,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 0.10835030549898167,
      "grad_norm": 0.08622051055470983,
      "kl": 0.113525390625,
      "learning_rate": 1e-06,
      "loss": 0.0019,
      "num_tokens": 7232843.0,
      "reward": 6.0859375,
      "reward_std": 0.3644236922264099,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.59375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9609375,
      "step": 266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 344.0,
      "completions/max_terminated_length": 344.0,
      "completions/mean_length": 248.40625,
      "completions/mean_terminated_length": 248.40625,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.10875763747454176,
      "grad_norm": 0.0884616838385181,
      "kl": 0.23699951171875,
      "learning_rate": 1e-06,
      "loss": -0.0005,
      "num_tokens": 7255488.0,
      "reward": 6.1171875,
      "reward_std": 0.9955595135688782,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.6875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8671875,
      "step": 267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 882.0,
      "completions/max_terminated_length": 882.0,
      "completions/mean_length": 401.09375,
      "completions/mean_terminated_length": 401.09375,
      "completions/min_length": 221.0,
      "completions/min_terminated_length": 221.0,
      "epoch": 0.10916496945010183,
      "grad_norm": 0.15097697805078084,
      "kl": 0.169158935546875,
      "learning_rate": 1e-06,
      "loss": -0.0061,
      "num_tokens": 7282555.0,
      "reward": 4.890625,
      "reward_std": 1.341402292251587,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.953125,
      "step": 268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 787.0,
      "completions/max_terminated_length": 787.0,
      "completions/mean_length": 352.65625,
      "completions/mean_terminated_length": 352.65625,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "epoch": 0.10957230142566192,
      "grad_norm": 0.1561985323559366,
      "kl": 0.1107177734375,
      "learning_rate": 1e-06,
      "loss": 0.0128,
      "num_tokens": 7308656.0,
      "reward": 4.765625,
      "reward_std": 1.2907824516296387,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.765625,
      "step": 269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 764.0,
      "completions/max_terminated_length": 764.0,
      "completions/mean_length": 326.4375,
      "completions/mean_terminated_length": 326.4375,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "epoch": 0.109979633401222,
      "grad_norm": 0.17435205466011472,
      "kl": 0.2305908203125,
      "learning_rate": 1e-06,
      "loss": 0.0046,
      "num_tokens": 7334062.0,
      "reward": 4.9609375,
      "reward_std": 1.5009129047393799,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7109375,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 713.0,
      "completions/max_terminated_length": 713.0,
      "completions/mean_length": 441.21875,
      "completions/mean_terminated_length": 441.21875,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "epoch": 0.11038696537678208,
      "grad_norm": 0.16387973109810397,
      "kl": 0.10418701171875,
      "learning_rate": 1e-06,
      "loss": 0.0035,
      "num_tokens": 7363685.0,
      "reward": 4.5,
      "reward_std": 1.1646424531936646,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9375,
      "step": 271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 607.0,
      "completions/max_terminated_length": 607.0,
      "completions/mean_length": 268.625,
      "completions/mean_terminated_length": 268.625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "epoch": 0.11079429735234216,
      "grad_norm": 0.1119476393435259,
      "kl": 0.447021484375,
      "learning_rate": 1e-06,
      "loss": -0.0036,
      "num_tokens": 7387521.0,
      "reward": 4.546875,
      "reward_std": 1.0633394718170166,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 787.0,
      "completions/max_terminated_length": 787.0,
      "completions/mean_length": 382.90625,
      "completions/mean_terminated_length": 382.90625,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.11120162932790224,
      "grad_norm": 0.1262634028903118,
      "kl": 0.6927490234375,
      "learning_rate": 1e-06,
      "loss": 0.0014,
      "num_tokens": 7414870.0,
      "reward": 4.5625,
      "reward_std": 0.5277684926986694,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8125,
      "step": 273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 635.0,
      "completions/max_terminated_length": 635.0,
      "completions/mean_length": 357.34375,
      "completions/mean_terminated_length": 357.34375,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "epoch": 0.11160896130346232,
      "grad_norm": 4.553639124987937,
      "kl": 54.0347900390625,
      "learning_rate": 1e-06,
      "loss": 0.0041,
      "num_tokens": 7441273.0,
      "reward": 5.125,
      "reward_std": 1.1450188159942627,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8125,
      "step": 274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 536.0,
      "completions/max_terminated_length": 536.0,
      "completions/mean_length": 334.3125,
      "completions/mean_terminated_length": 345.0967741935484,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 224.0,
      "epoch": 0.1120162932790224,
      "grad_norm": 1.8865599360062382,
      "kl": 26.4600830078125,
      "learning_rate": 1e-06,
      "loss": -0.0058,
      "num_tokens": 7471175.0,
      "reward": 4.328125,
      "reward_std": 1.3908053636550903,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.703125,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 823.0,
      "completions/max_terminated_length": 823.0,
      "completions/mean_length": 467.71875,
      "completions/mean_terminated_length": 467.71875,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.11242362525458248,
      "grad_norm": 0.14878318328382348,
      "kl": 0.144287109375,
      "learning_rate": 1e-06,
      "loss": 0.0033,
      "num_tokens": 7501630.0,
      "reward": 4.5,
      "reward_std": 1.01895272731781,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.90625,
      "step": 276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1442.0,
      "completions/max_terminated_length": 1442.0,
      "completions/mean_length": 625.5,
      "completions/mean_terminated_length": 625.5,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "epoch": 0.11283095723014257,
      "grad_norm": 0.16104492485108915,
      "kl": 0.12481689453125,
      "learning_rate": 1e-06,
      "loss": 0.0026,
      "num_tokens": 7536238.0,
      "reward": 3.953125,
      "reward_std": 1.168134093284607,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.90625,
      "step": 277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1489.0,
      "completions/max_terminated_length": 1489.0,
      "completions/mean_length": 455.0625,
      "completions/mean_terminated_length": 455.0625,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.11323828920570264,
      "grad_norm": 0.19205069080876094,
      "kl": 0.22412109375,
      "learning_rate": 1e-06,
      "loss": -0.0088,
      "num_tokens": 7566120.0,
      "reward": 4.09375,
      "reward_std": 1.2660152912139893,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.84375,
      "step": 278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 503.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 282.9375,
      "completions/mean_terminated_length": 282.9375,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "epoch": 0.11364562118126273,
      "grad_norm": 0.1483138613685051,
      "kl": 0.246337890625,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 7589582.0,
      "reward": 5.0234375,
      "reward_std": 1.3213204145431519,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.7265625,
      "step": 279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 649.0,
      "completions/max_terminated_length": 649.0,
      "completions/mean_length": 321.71875,
      "completions/mean_terminated_length": 321.71875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.11405295315682282,
      "grad_norm": 0.11701109651705693,
      "kl": 0.1343994140625,
      "learning_rate": 1e-06,
      "loss": 0.0033,
      "num_tokens": 7615005.0,
      "reward": 5.6953125,
      "reward_std": 0.9445090293884277,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 587.0,
      "completions/max_terminated_length": 587.0,
      "completions/mean_length": 323.8125,
      "completions/mean_terminated_length": 323.8125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "epoch": 0.11446028513238289,
      "grad_norm": 0.08937033137104666,
      "kl": 0.0780029296875,
      "learning_rate": 1e-06,
      "loss": -0.0012,
      "num_tokens": 7640079.0,
      "reward": 5.6484375,
      "reward_std": 0.7428663372993469,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8984375,
      "step": 281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 531.0,
      "completions/max_terminated_length": 531.0,
      "completions/mean_length": 221.875,
      "completions/mean_terminated_length": 221.875,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.11486761710794298,
      "grad_norm": 0.106298534495795,
      "kl": 0.39532470703125,
      "learning_rate": 1e-06,
      "loss": 0.0052,
      "num_tokens": 7661979.0,
      "reward": 5.75,
      "reward_std": 1.0583428144454956,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.59375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.75,
      "step": 282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 662.0,
      "completions/max_terminated_length": 662.0,
      "completions/mean_length": 403.3125,
      "completions/mean_terminated_length": 403.3125,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "epoch": 0.11527494908350305,
      "grad_norm": 0.180058683124987,
      "kl": 0.06756591796875,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 7689765.0,
      "reward": 5.15625,
      "reward_std": 1.1191527843475342,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.890625,
      "step": 283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 594.0,
      "completions/max_terminated_length": 594.0,
      "completions/mean_length": 264.75,
      "completions/mean_terminated_length": 273.2903225806452,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.11568228105906314,
      "grad_norm": 0.17170194624800517,
      "kl": 2.3291015625,
      "learning_rate": 1e-06,
      "loss": -0.0113,
      "num_tokens": 7716337.0,
      "reward": 5.2109375,
      "reward_std": 1.13656485080719,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 671.0,
      "completions/max_terminated_length": 671.0,
      "completions/mean_length": 379.5625,
      "completions/mean_terminated_length": 379.5625,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 0.11608961303462322,
      "grad_norm": 0.13016667132985651,
      "kl": 0.38629150390625,
      "learning_rate": 1e-06,
      "loss": 0.0006,
      "num_tokens": 7744059.0,
      "reward": 4.5078125,
      "reward_std": 1.093302607536316,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 969.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 324.125,
      "completions/mean_terminated_length": 324.125,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.1164969450101833,
      "grad_norm": 0.13264994720706488,
      "kl": 0.1514892578125,
      "learning_rate": 1e-06,
      "loss": 0.018,
      "num_tokens": 7770631.0,
      "reward": 5.15625,
      "reward_std": 1.1100351810455322,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 572.0,
      "completions/max_terminated_length": 572.0,
      "completions/mean_length": 350.28125,
      "completions/mean_terminated_length": 350.28125,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.11690427698574338,
      "grad_norm": 0.14118972283961445,
      "kl": 0.1871337890625,
      "learning_rate": 1e-06,
      "loss": -0.0001,
      "num_tokens": 7796424.0,
      "reward": 4.578125,
      "reward_std": 1.092275619506836,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6875,
      "step": 287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 613.0,
      "completions/max_terminated_length": 613.0,
      "completions/mean_length": 302.90625,
      "completions/mean_terminated_length": 302.90625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "epoch": 0.11731160896130347,
      "grad_norm": 0.1515250622599137,
      "kl": 0.2572021484375,
      "learning_rate": 1e-06,
      "loss": 0.0008,
      "num_tokens": 7820885.0,
      "reward": 5.53125,
      "reward_std": 1.1256417036056519,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6875,
      "step": 288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 606.0,
      "completions/max_terminated_length": 606.0,
      "completions/mean_length": 335.5,
      "completions/mean_terminated_length": 335.5,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "epoch": 0.11771894093686354,
      "grad_norm": 0.1885785813799223,
      "kl": 0.2010498046875,
      "learning_rate": 1e-06,
      "loss": 0.0049,
      "num_tokens": 7845949.0,
      "reward": 5.046875,
      "reward_std": 1.400909185409546,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 584.0,
      "completions/max_terminated_length": 584.0,
      "completions/mean_length": 349.03125,
      "completions/mean_terminated_length": 349.03125,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.11812627291242363,
      "grad_norm": 0.15741667715465132,
      "kl": 0.1563720703125,
      "learning_rate": 1e-06,
      "loss": 0.0025,
      "num_tokens": 7871958.0,
      "reward": 4.7578125,
      "reward_std": 1.3605859279632568,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 745.0,
      "completions/max_terminated_length": 745.0,
      "completions/mean_length": 374.65625,
      "completions/mean_terminated_length": 374.65625,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "epoch": 0.1185336048879837,
      "grad_norm": 0.16564801145243255,
      "kl": 0.73828125,
      "learning_rate": 1e-06,
      "loss": -0.0035,
      "num_tokens": 7899091.0,
      "reward": 4.3203125,
      "reward_std": 1.1249115467071533,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1136.0,
      "completions/max_terminated_length": 1136.0,
      "completions/mean_length": 460.03125,
      "completions/mean_terminated_length": 460.03125,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "epoch": 0.11894093686354379,
      "grad_norm": 0.9810150213275214,
      "kl": 8.68408203125,
      "learning_rate": 1e-06,
      "loss": 0.007,
      "num_tokens": 7929436.0,
      "reward": 4.6640625,
      "reward_std": 1.204304814338684,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7890625,
      "step": 292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 556.0,
      "completions/max_terminated_length": 556.0,
      "completions/mean_length": 333.25,
      "completions/mean_terminated_length": 333.25,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "epoch": 0.11934826883910386,
      "grad_norm": 0.13758647833349244,
      "kl": 0.2254638671875,
      "learning_rate": 1e-06,
      "loss": 0.0032,
      "num_tokens": 7955676.0,
      "reward": 5.0234375,
      "reward_std": 1.1061217784881592,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 593.0,
      "completions/max_terminated_length": 593.0,
      "completions/mean_length": 334.875,
      "completions/mean_terminated_length": 334.875,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "epoch": 0.11975560081466395,
      "grad_norm": 0.32509081149452707,
      "kl": 4.1324462890625,
      "learning_rate": 1e-06,
      "loss": 0.0102,
      "num_tokens": 7981536.0,
      "reward": 5.1484375,
      "reward_std": 1.2189942598342896,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8515625,
      "step": 294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 940.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 514.75,
      "completions/mean_terminated_length": 514.75,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "epoch": 0.12016293279022404,
      "grad_norm": 0.16648108780140852,
      "kl": 0.33056640625,
      "learning_rate": 1e-06,
      "loss": -0.0022,
      "num_tokens": 8012952.0,
      "reward": 4.765625,
      "reward_std": 1.218648910522461,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.890625,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 565.0,
      "completions/max_terminated_length": 565.0,
      "completions/mean_length": 295.84375,
      "completions/mean_terminated_length": 305.38709677419354,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 209.0,
      "epoch": 0.12057026476578411,
      "grad_norm": 0.11883015365571581,
      "kl": 0.2327880859375,
      "learning_rate": 1e-06,
      "loss": -0.0151,
      "num_tokens": 8041419.0,
      "reward": 5.375,
      "reward_std": 0.748497724533081,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8125,
      "step": 296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 660.0,
      "completions/max_terminated_length": 660.0,
      "completions/mean_length": 289.34375,
      "completions/mean_terminated_length": 289.34375,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "epoch": 0.1209775967413442,
      "grad_norm": 0.10458427429280603,
      "kl": 0.536865234375,
      "learning_rate": 1e-06,
      "loss": -0.0069,
      "num_tokens": 8065958.0,
      "reward": 4.609375,
      "reward_std": 0.9699586033821106,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.578125,
      "step": 297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1006.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 492.1875,
      "completions/mean_terminated_length": 492.1875,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.12138492871690428,
      "grad_norm": 0.14186188295209,
      "kl": 0.09375,
      "learning_rate": 1e-06,
      "loss": -0.0004,
      "num_tokens": 8097228.0,
      "reward": 4.8671875,
      "reward_std": 1.137942910194397,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 663.0,
      "completions/max_terminated_length": 663.0,
      "completions/mean_length": 321.15625,
      "completions/mean_terminated_length": 321.15625,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "epoch": 0.12179226069246436,
      "grad_norm": 0.09821785497127014,
      "kl": 0.302734375,
      "learning_rate": 1e-06,
      "loss": 0.0014,
      "num_tokens": 8121425.0,
      "reward": 5.4453125,
      "reward_std": 0.7561496496200562,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 718.0,
      "completions/max_terminated_length": 718.0,
      "completions/mean_length": 346.0625,
      "completions/mean_terminated_length": 346.0625,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "epoch": 0.12219959266802444,
      "grad_norm": 0.17510460830680546,
      "kl": 0.271728515625,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 8147603.0,
      "reward": 4.9453125,
      "reward_std": 1.1475259065628052,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 693.0,
      "completions/max_terminated_length": 693.0,
      "completions/mean_length": 305.59375,
      "completions/mean_terminated_length": 305.59375,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.12260692464358453,
      "grad_norm": 0.12321578572372362,
      "kl": 0.157958984375,
      "learning_rate": 1e-06,
      "loss": 0.0021,
      "num_tokens": 8172142.0,
      "reward": 4.53125,
      "reward_std": 0.9470474123954773,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 685.0,
      "completions/max_terminated_length": 685.0,
      "completions/mean_length": 368.46875,
      "completions/mean_terminated_length": 368.46875,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "epoch": 0.1230142566191446,
      "grad_norm": 0.14119045615995254,
      "kl": 0.05419921875,
      "learning_rate": 1e-06,
      "loss": 0.0085,
      "num_tokens": 8199053.0,
      "reward": 5.25,
      "reward_std": 0.6313312649726868,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 493.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 291.25,
      "completions/mean_terminated_length": 291.25,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 0.12342158859470469,
      "grad_norm": 0.1190369601319705,
      "kl": 0.06005859375,
      "learning_rate": 1e-06,
      "loss": 0.0013,
      "num_tokens": 8223021.0,
      "reward": 5.34375,
      "reward_std": 1.0053589344024658,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.84375,
      "step": 303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 729.0,
      "completions/max_terminated_length": 729.0,
      "completions/mean_length": 278.59375,
      "completions/mean_terminated_length": 278.59375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.12382892057026476,
      "grad_norm": 0.152233616540164,
      "kl": 1.1922607421875,
      "learning_rate": 1e-06,
      "loss": -0.0056,
      "num_tokens": 8246048.0,
      "reward": 5.7890625,
      "reward_std": 0.576694667339325,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.59375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 433.0,
      "completions/max_terminated_length": 433.0,
      "completions/mean_length": 233.9375,
      "completions/mean_terminated_length": 233.9375,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "epoch": 0.12423625254582485,
      "grad_norm": 0.08187401157892721,
      "kl": 0.1328125,
      "learning_rate": 1e-06,
      "loss": -0.0029,
      "num_tokens": 8268582.0,
      "reward": 6.125,
      "reward_std": 0.8295804858207703,
      "rewards/cargo_build_reward": 1.0,
      "rewards/cargo_clippy_reward": 1.0,
      "rewards/cargo_test_reward": 0.6875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.75,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 402.0,
      "completions/max_terminated_length": 402.0,
      "completions/mean_length": 295.46875,
      "completions/mean_terminated_length": 295.46875,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.12464358452138492,
      "grad_norm": 0.11004194557650704,
      "kl": 0.040069580078125,
      "learning_rate": 1e-06,
      "loss": -0.0016,
      "num_tokens": 8293229.0,
      "reward": 6.21875,
      "reward_std": 0.8219360113143921,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.71875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.84375,
      "step": 306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 920.0,
      "completions/max_terminated_length": 920.0,
      "completions/mean_length": 361.375,
      "completions/mean_terminated_length": 361.375,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.125050916496945,
      "grad_norm": 0.10735721194718409,
      "kl": 0.5159912109375,
      "learning_rate": 1e-06,
      "loss": 0.0039,
      "num_tokens": 8319873.0,
      "reward": 5.25,
      "reward_std": 0.7978966236114502,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6875,
      "step": 307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 682.0,
      "completions/max_terminated_length": 682.0,
      "completions/mean_length": 422.0,
      "completions/mean_terminated_length": 422.0,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "epoch": 0.12545824847250509,
      "grad_norm": 0.5269721152618081,
      "kl": 4.53875732421875,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 8348729.0,
      "reward": 4.984375,
      "reward_std": 1.289331316947937,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 535.0,
      "completions/max_terminated_length": 535.0,
      "completions/mean_length": 323.15625,
      "completions/mean_terminated_length": 323.15625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.12586558044806517,
      "grad_norm": 0.14390950392071175,
      "kl": 0.1331787109375,
      "learning_rate": 1e-06,
      "loss": 0.0029,
      "num_tokens": 8373958.0,
      "reward": 5.1796875,
      "reward_std": 0.7235574722290039,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 680.0,
      "completions/max_terminated_length": 680.0,
      "completions/mean_length": 356.75,
      "completions/mean_terminated_length": 356.75,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "epoch": 0.12627291242362526,
      "grad_norm": 0.13907683225817433,
      "kl": 0.04400634765625,
      "learning_rate": 1e-06,
      "loss": -0.0061,
      "num_tokens": 8400790.0,
      "reward": 5.390625,
      "reward_std": 1.155860185623169,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.984375,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 771.0,
      "completions/max_terminated_length": 771.0,
      "completions/mean_length": 297.96875,
      "completions/mean_terminated_length": 297.96875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "epoch": 0.12668024439918535,
      "grad_norm": 0.08076983390044086,
      "kl": 0.21533203125,
      "learning_rate": 1e-06,
      "loss": -0.002,
      "num_tokens": 8425317.0,
      "reward": 5.8359375,
      "reward_std": 0.5163742303848267,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.71875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 523.0,
      "completions/max_terminated_length": 523.0,
      "completions/mean_length": 312.65625,
      "completions/mean_terminated_length": 312.65625,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.1270875763747454,
      "grad_norm": 0.14598538570078823,
      "kl": 0.095458984375,
      "learning_rate": 1e-06,
      "loss": 0.0057,
      "num_tokens": 8450338.0,
      "reward": 4.7265625,
      "reward_std": 0.9958631992340088,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 482.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 269.09375,
      "completions/mean_terminated_length": 269.09375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.1274949083503055,
      "grad_norm": 0.148913448446011,
      "kl": 0.2449951171875,
      "learning_rate": 1e-06,
      "loss": 0.0014,
      "num_tokens": 8473789.0,
      "reward": 5.484375,
      "reward_std": 1.3874802589416504,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.734375,
      "step": 313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 893.0,
      "completions/max_terminated_length": 893.0,
      "completions/mean_length": 344.84375,
      "completions/mean_terminated_length": 344.84375,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.12790224032586558,
      "grad_norm": 0.1460142084296301,
      "kl": 0.1802978515625,
      "learning_rate": 1e-06,
      "loss": 0.0011,
      "num_tokens": 8499832.0,
      "reward": 5.7421875,
      "reward_std": 0.72386634349823,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.6875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 953.0,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 406.53125,
      "completions/mean_terminated_length": 406.53125,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "epoch": 0.12830957230142567,
      "grad_norm": 0.15246647434109337,
      "kl": 0.2283935546875,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 8527793.0,
      "reward": 4.6640625,
      "reward_std": 1.1216604709625244,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7578125,
      "step": 315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 566.0,
      "completions/max_terminated_length": 566.0,
      "completions/mean_length": 342.34375,
      "completions/mean_terminated_length": 342.34375,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.12871690427698573,
      "grad_norm": 0.1582283452943812,
      "kl": 0.1551513671875,
      "learning_rate": 1e-06,
      "loss": 0.003,
      "num_tokens": 8554076.0,
      "reward": 5.3359375,
      "reward_std": 1.1304055452346802,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1441.0,
      "completions/max_terminated_length": 1441.0,
      "completions/mean_length": 500.71875,
      "completions/mean_terminated_length": 500.71875,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.12912423625254582,
      "grad_norm": 0.1841685716627463,
      "kl": 0.128814697265625,
      "learning_rate": 1e-06,
      "loss": 0.0082,
      "num_tokens": 8585875.0,
      "reward": 4.828125,
      "reward_std": 1.1464003324508667,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.78125,
      "step": 317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 631.0,
      "completions/max_terminated_length": 631.0,
      "completions/mean_length": 403.90625,
      "completions/mean_terminated_length": 403.90625,
      "completions/min_length": 263.0,
      "completions/min_terminated_length": 263.0,
      "epoch": 0.1295315682281059,
      "grad_norm": 0.17590877975865102,
      "kl": 0.82269287109375,
      "learning_rate": 1e-06,
      "loss": 0.0019,
      "num_tokens": 8614400.0,
      "reward": 5.015625,
      "reward_std": 1.1951438188552856,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.890625,
      "step": 318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 812.0,
      "completions/max_terminated_length": 812.0,
      "completions/mean_length": 428.0,
      "completions/mean_terminated_length": 428.0,
      "completions/min_length": 255.0,
      "completions/min_terminated_length": 255.0,
      "epoch": 0.129938900203666,
      "grad_norm": 0.15800969865178677,
      "kl": 0.3536376953125,
      "learning_rate": 1e-06,
      "loss": 0.0045,
      "num_tokens": 8642992.0,
      "reward": 4.6796875,
      "reward_std": 1.075789213180542,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9296875,
      "step": 319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 963.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 574.84375,
      "completions/mean_terminated_length": 574.84375,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 0.13034623217922606,
      "grad_norm": 0.16975018817715407,
      "kl": 0.6793212890625,
      "learning_rate": 1e-06,
      "loss": 0.0015,
      "num_tokens": 8676963.0,
      "reward": 3.9140625,
      "reward_std": 1.0297878980636597,
      "rewards/cargo_build_reward": 0.5,
      "rewards/cargo_clippy_reward": 0.5,
      "rewards/cargo_test_reward": 0.0,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 851.0,
      "completions/max_terminated_length": 851.0,
      "completions/mean_length": 295.84375,
      "completions/mean_terminated_length": 295.84375,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "epoch": 0.13075356415478614,
      "grad_norm": 0.12735954359973728,
      "kl": 0.1572265625,
      "learning_rate": 1e-06,
      "loss": 0.0079,
      "num_tokens": 8700990.0,
      "reward": 5.609375,
      "reward_std": 1.1104497909545898,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 819.0,
      "completions/max_terminated_length": 819.0,
      "completions/mean_length": 365.78125,
      "completions/mean_terminated_length": 365.78125,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.13116089613034623,
      "grad_norm": 0.1910300889247685,
      "kl": 1.3275146484375,
      "learning_rate": 1e-06,
      "loss": 0.0156,
      "num_tokens": 8727079.0,
      "reward": 5.046875,
      "reward_std": 1.2593364715576172,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.984375,
      "step": 322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1026.0,
      "completions/max_terminated_length": 1026.0,
      "completions/mean_length": 478.21875,
      "completions/mean_terminated_length": 478.21875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "epoch": 0.13156822810590632,
      "grad_norm": 0.09355194134232056,
      "kl": 0.1529541015625,
      "learning_rate": 1e-06,
      "loss": 0.0012,
      "num_tokens": 8757606.0,
      "reward": 4.1484375,
      "reward_std": 0.6420546174049377,
      "rewards/cargo_build_reward": 0.53125,
      "rewards/cargo_clippy_reward": 0.53125,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 483.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 289.4375,
      "completions/mean_terminated_length": 289.4375,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "epoch": 0.1319755600814664,
      "grad_norm": 0.11677201059195703,
      "kl": 0.129638671875,
      "learning_rate": 1e-06,
      "loss": -0.0024,
      "num_tokens": 8782420.0,
      "reward": 4.953125,
      "reward_std": 0.6945017576217651,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.765625,
      "step": 324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1687.0,
      "completions/max_terminated_length": 1687.0,
      "completions/mean_length": 491.75,
      "completions/mean_terminated_length": 491.75,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.13238289205702647,
      "grad_norm": 0.15681318631924113,
      "kl": 0.09246826171875,
      "learning_rate": 1e-06,
      "loss": 0.0229,
      "num_tokens": 8813252.0,
      "reward": 4.15625,
      "reward_std": 0.9840061664581299,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 828.0,
      "completions/max_terminated_length": 828.0,
      "completions/mean_length": 395.15625,
      "completions/mean_terminated_length": 395.15625,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "epoch": 0.13279022403258656,
      "grad_norm": 0.0818899407346736,
      "kl": 0.06695556640625,
      "learning_rate": 1e-06,
      "loss": 0.0031,
      "num_tokens": 8841049.0,
      "reward": 4.5078125,
      "reward_std": 0.7758102416992188,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 498.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 294.65625,
      "completions/mean_terminated_length": 294.65625,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "epoch": 0.13319755600814664,
      "grad_norm": 0.11106359578436034,
      "kl": 0.0850830078125,
      "learning_rate": 1e-06,
      "loss": 0.0034,
      "num_tokens": 8865686.0,
      "reward": 5.0078125,
      "reward_std": 0.8287466764450073,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6953125,
      "step": 327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 671.0,
      "completions/max_terminated_length": 671.0,
      "completions/mean_length": 382.1875,
      "completions/mean_terminated_length": 382.1875,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.13360488798370673,
      "grad_norm": 0.2628942425661649,
      "kl": 0.08209228515625,
      "learning_rate": 1e-06,
      "loss": 0.0028,
      "num_tokens": 8893004.0,
      "reward": 5.0546875,
      "reward_std": 1.4586012363433838,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8671875,
      "step": 328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1137.0,
      "completions/max_terminated_length": 1137.0,
      "completions/mean_length": 447.625,
      "completions/mean_terminated_length": 462.06451612903226,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 131.0,
      "epoch": 0.1340122199592668,
      "grad_norm": 0.14008020197789925,
      "kl": 0.04327392578125,
      "learning_rate": 1e-06,
      "loss": -0.0257,
      "num_tokens": 8926071.0,
      "reward": 5.3671875,
      "reward_std": 0.9594376087188721,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8671875,
      "step": 329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 504.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 286.6875,
      "completions/mean_terminated_length": 286.6875,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.13441955193482688,
      "grad_norm": 0.08983797578932191,
      "kl": 0.75439453125,
      "learning_rate": 1e-06,
      "loss": 0.0019,
      "num_tokens": 8950333.0,
      "reward": 5.859375,
      "reward_std": 0.7503848075866699,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.984375,
      "step": 330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 770.0,
      "completions/max_terminated_length": 770.0,
      "completions/mean_length": 433.5625,
      "completions/mean_terminated_length": 433.5625,
      "completions/min_length": 228.0,
      "completions/min_terminated_length": 228.0,
      "epoch": 0.13482688391038697,
      "grad_norm": 0.18433011032100935,
      "kl": 0.2506103515625,
      "learning_rate": 1e-06,
      "loss": 0.0015,
      "num_tokens": 8979783.0,
      "reward": 4.296875,
      "reward_std": 1.180513620376587,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.734375,
      "step": 331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 515.0,
      "completions/max_terminated_length": 515.0,
      "completions/mean_length": 249.09375,
      "completions/mean_terminated_length": 249.74193548387098,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.13523421588594706,
      "grad_norm": 0.11900292517574411,
      "kl": 0.2587890625,
      "learning_rate": 1e-06,
      "loss": -0.0067,
      "num_tokens": 9003378.0,
      "reward": 5.8984375,
      "reward_std": 0.7815060615539551,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.6875,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.9375,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.6953125,
      "step": 332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 554.0,
      "completions/max_terminated_length": 554.0,
      "completions/mean_length": 280.5,
      "completions/mean_terminated_length": 280.5,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.13564154786150712,
      "grad_norm": 0.15159153025833763,
      "kl": 0.3828125,
      "learning_rate": 1e-06,
      "loss": 0.0092,
      "num_tokens": 9028162.0,
      "reward": 5.1484375,
      "reward_std": 1.1171514987945557,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.5859375,
      "step": 333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1117.0,
      "completions/max_terminated_length": 1117.0,
      "completions/mean_length": 367.28125,
      "completions/mean_terminated_length": 367.28125,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "epoch": 0.1360488798370672,
      "grad_norm": 0.10697897189105052,
      "kl": 0.24005126953125,
      "learning_rate": 1e-06,
      "loss": 0.0066,
      "num_tokens": 9055331.0,
      "reward": 4.46875,
      "reward_std": 1.0055537223815918,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.78125,
      "step": 334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 629.0,
      "completions/max_terminated_length": 629.0,
      "completions/mean_length": 391.46875,
      "completions/mean_terminated_length": 391.46875,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "epoch": 0.1364562118126273,
      "grad_norm": 0.1396713879603593,
      "kl": 0.1015625,
      "learning_rate": 1e-06,
      "loss": -0.0046,
      "num_tokens": 9083442.0,
      "reward": 4.421875,
      "reward_std": 1.1342871189117432,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 969.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 435.15625,
      "completions/mean_terminated_length": 435.15625,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "epoch": 0.13686354378818738,
      "grad_norm": 0.17972755364640292,
      "kl": 0.1123046875,
      "learning_rate": 1e-06,
      "loss": -0.0033,
      "num_tokens": 9112679.0,
      "reward": 5.0390625,
      "reward_std": 1.356706142425537,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8515625,
      "step": 336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 796.0,
      "completions/max_terminated_length": 796.0,
      "completions/mean_length": 366.65625,
      "completions/mean_terminated_length": 366.65625,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.13727087576374744,
      "grad_norm": 0.11405749313429404,
      "kl": 0.12042236328125,
      "learning_rate": 1e-06,
      "loss": 0.0032,
      "num_tokens": 9139172.0,
      "reward": 5.796875,
      "reward_std": 1.0548584461212158,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.984375,
      "step": 337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1255.0,
      "completions/max_terminated_length": 1255.0,
      "completions/mean_length": 461.875,
      "completions/mean_terminated_length": 461.875,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 0.13767820773930753,
      "grad_norm": 0.13903505250638792,
      "kl": 0.2977294921875,
      "learning_rate": 1e-06,
      "loss": 0.0029,
      "num_tokens": 9169560.0,
      "reward": 3.6171875,
      "reward_std": 0.8829737305641174,
      "rewards/cargo_build_reward": 0.34375,
      "rewards/cargo_clippy_reward": 0.28125,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9296875,
      "step": 338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 742.0,
      "completions/max_terminated_length": 742.0,
      "completions/mean_length": 355.875,
      "completions/mean_terminated_length": 355.875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "epoch": 0.13808553971486762,
      "grad_norm": 0.16386479710192556,
      "kl": 0.55517578125,
      "learning_rate": 1e-06,
      "loss": -0.008,
      "num_tokens": 9195884.0,
      "reward": 5.4453125,
      "reward_std": 1.189713954925537,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.90625,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 759.0,
      "completions/max_terminated_length": 759.0,
      "completions/mean_length": 429.625,
      "completions/mean_terminated_length": 429.625,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.1384928716904277,
      "grad_norm": 0.16134912197389864,
      "kl": 0.1817626953125,
      "learning_rate": 1e-06,
      "loss": 0.0069,
      "num_tokens": 9223960.0,
      "reward": 5.1484375,
      "reward_std": 1.121705412864685,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9609375,
      "step": 340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 866.0,
      "completions/max_terminated_length": 866.0,
      "completions/mean_length": 369.125,
      "completions/mean_terminated_length": 369.125,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "epoch": 0.1389002036659878,
      "grad_norm": 0.14456161106416965,
      "kl": 0.25439453125,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 9251340.0,
      "reward": 5.09375,
      "reward_std": 1.045795202255249,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.71875,
      "step": 341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 378.0,
      "completions/max_terminated_length": 378.0,
      "completions/mean_length": 282.96875,
      "completions/mean_terminated_length": 282.96875,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.13930753564154785,
      "grad_norm": 0.07830169143624639,
      "kl": 0.06414794921875,
      "learning_rate": 1e-06,
      "loss": 0.002,
      "num_tokens": 9275587.0,
      "reward": 5.7421875,
      "reward_std": 0.7753755450248718,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9296875,
      "step": 342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 597.0,
      "completions/max_terminated_length": 597.0,
      "completions/mean_length": 324.46875,
      "completions/mean_terminated_length": 324.46875,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.13971486761710794,
      "grad_norm": 0.18460081439328185,
      "kl": 2.6995849609375,
      "learning_rate": 1e-06,
      "loss": 0.0066,
      "num_tokens": 9300674.0,
      "reward": 4.9140625,
      "reward_std": 1.3185405731201172,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7265625,
      "step": 343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1234.0,
      "completions/max_terminated_length": 1234.0,
      "completions/mean_length": 355.90625,
      "completions/mean_terminated_length": 355.90625,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "epoch": 0.14012219959266803,
      "grad_norm": 0.09901975986434068,
      "kl": 0.2659912109375,
      "learning_rate": 1e-06,
      "loss": -0.0034,
      "num_tokens": 9326943.0,
      "reward": 5.6484375,
      "reward_std": 0.6417502164840698,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 740.0,
      "completions/max_terminated_length": 740.0,
      "completions/mean_length": 366.71875,
      "completions/mean_terminated_length": 366.71875,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.14052953156822812,
      "grad_norm": 0.09331007976098271,
      "kl": 0.1285400390625,
      "learning_rate": 1e-06,
      "loss": -0.0011,
      "num_tokens": 9354206.0,
      "reward": 5.6171875,
      "reward_std": 0.5458313226699829,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9296875,
      "step": 345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 771.0,
      "completions/max_terminated_length": 771.0,
      "completions/mean_length": 440.0625,
      "completions/mean_terminated_length": 440.0625,
      "completions/min_length": 249.0,
      "completions/min_terminated_length": 249.0,
      "epoch": 0.14093686354378818,
      "grad_norm": 0.14630331349890233,
      "kl": 0.346435546875,
      "learning_rate": 1e-06,
      "loss": 0.0068,
      "num_tokens": 9383000.0,
      "reward": 4.734375,
      "reward_std": 1.0637853145599365,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 619.0,
      "completions/max_terminated_length": 619.0,
      "completions/mean_length": 381.65625,
      "completions/mean_terminated_length": 381.65625,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 0.14134419551934826,
      "grad_norm": 0.17331226749699366,
      "kl": 0.6561279296875,
      "learning_rate": 1e-06,
      "loss": 0.0043,
      "num_tokens": 9410541.0,
      "reward": 4.4921875,
      "reward_std": 1.2115974426269531,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6171875,
      "step": 347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 511.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 323.21875,
      "completions/mean_terminated_length": 322.2258064516129,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "epoch": 0.14175152749490835,
      "grad_norm": 0.16129291860470463,
      "kl": 0.4150390625,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 9435276.0,
      "reward": 4.765625,
      "reward_std": 1.3257715702056885,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.703125,
      "step": 348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 625.0,
      "completions/max_terminated_length": 625.0,
      "completions/mean_length": 392.125,
      "completions/mean_terminated_length": 392.125,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.14215885947046844,
      "grad_norm": 0.24621323411601634,
      "kl": 0.520263671875,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 9462464.0,
      "reward": 5.3671875,
      "reward_std": 1.42042875289917,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1103.0,
      "completions/max_terminated_length": 1103.0,
      "completions/mean_length": 400.78125,
      "completions/mean_terminated_length": 400.78125,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "epoch": 0.1425661914460285,
      "grad_norm": 0.129682090022344,
      "kl": 0.437744140625,
      "learning_rate": 1e-06,
      "loss": 0.0025,
      "num_tokens": 9490625.0,
      "reward": 5.5859375,
      "reward_std": 0.9094717502593994,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.59375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 684.0,
      "completions/max_terminated_length": 684.0,
      "completions/mean_length": 352.46875,
      "completions/mean_terminated_length": 352.46875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "epoch": 0.1429735234215886,
      "grad_norm": 0.09688786730480363,
      "kl": 0.23370361328125,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 9516376.0,
      "reward": 5.515625,
      "reward_std": 0.7535701990127563,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.953125,
      "step": 351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 538.0,
      "completions/max_terminated_length": 538.0,
      "completions/mean_length": 292.625,
      "completions/mean_terminated_length": 292.625,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "epoch": 0.14338085539714868,
      "grad_norm": 0.08622417964860002,
      "kl": 0.849609375,
      "learning_rate": 1e-06,
      "loss": 0.0025,
      "num_tokens": 9541268.0,
      "reward": 5.8515625,
      "reward_std": 0.37146395444869995,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.5625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7890625,
      "step": 352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 641.0,
      "completions/max_terminated_length": 641.0,
      "completions/mean_length": 338.96875,
      "completions/mean_terminated_length": 338.96875,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "epoch": 0.14378818737270876,
      "grad_norm": 0.14260709357338208,
      "kl": 0.534912109375,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 9567107.0,
      "reward": 5.1875,
      "reward_std": 1.0232089757919312,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8125,
      "step": 353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 652.0,
      "completions/max_terminated_length": 652.0,
      "completions/mean_length": 275.34375,
      "completions/mean_terminated_length": 275.34375,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "epoch": 0.14419551934826885,
      "grad_norm": 0.10357169198308107,
      "kl": 0.74462890625,
      "learning_rate": 1e-06,
      "loss": -0.0022,
      "num_tokens": 9591462.0,
      "reward": 5.9296875,
      "reward_std": 0.9630800485610962,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8671875,
      "step": 354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 578.0,
      "completions/max_terminated_length": 578.0,
      "completions/mean_length": 309.15625,
      "completions/mean_terminated_length": 309.15625,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "epoch": 0.1446028513238289,
      "grad_norm": 0.09464501278494115,
      "kl": 0.219482421875,
      "learning_rate": 1e-06,
      "loss": -0.0039,
      "num_tokens": 9616315.0,
      "reward": 5.6171875,
      "reward_std": 0.6173626184463501,
      "rewards/cargo_build_reward": 1.0,
      "rewards/cargo_clippy_reward": 1.0,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 982.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 376.6875,
      "completions/mean_terminated_length": 376.6875,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "epoch": 0.145010183299389,
      "grad_norm": 1.8299172629926406,
      "kl": 13.1044921875,
      "learning_rate": 1e-06,
      "loss": 0.013,
      "num_tokens": 9643193.0,
      "reward": 5.875,
      "reward_std": 1.1920604705810547,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9375,
      "step": 356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 387.0,
      "completions/max_terminated_length": 387.0,
      "completions/mean_length": 221.25,
      "completions/mean_terminated_length": 221.25,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "epoch": 0.1454175152749491,
      "grad_norm": 0.062390222221837544,
      "kl": 0.460693359375,
      "learning_rate": 1e-06,
      "loss": -0.0007,
      "num_tokens": 9664665.0,
      "reward": 6.1640625,
      "reward_std": 0.5787454843521118,
      "rewards/cargo_build_reward": 1.0,
      "rewards/cargo_clippy_reward": 1.0,
      "rewards/cargo_test_reward": 0.65625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8515625,
      "step": 357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 628.0,
      "completions/max_terminated_length": 628.0,
      "completions/mean_length": 379.125,
      "completions/mean_terminated_length": 379.125,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.14582484725050918,
      "grad_norm": 0.19469566990993933,
      "kl": 1.071533203125,
      "learning_rate": 1e-06,
      "loss": 0.0034,
      "num_tokens": 9692541.0,
      "reward": 4.609375,
      "reward_std": 1.0437216758728027,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 522.0,
      "completions/max_terminated_length": 522.0,
      "completions/mean_length": 336.625,
      "completions/mean_terminated_length": 336.625,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "epoch": 0.14623217922606924,
      "grad_norm": 0.11000479251888631,
      "kl": 0.231689453125,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 9718561.0,
      "reward": 5.0625,
      "reward_std": 0.8462489247322083,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9375,
      "step": 359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1347.0,
      "completions/max_terminated_length": 1347.0,
      "completions/mean_length": 366.6875,
      "completions/mean_terminated_length": 366.6875,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "epoch": 0.14663951120162932,
      "grad_norm": 0.2937439499832218,
      "kl": 0.7568359375,
      "learning_rate": 1e-06,
      "loss": 0.0305,
      "num_tokens": 9746159.0,
      "reward": 4.515625,
      "reward_std": 1.369112253189087,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.515625,
      "step": 360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 942.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 368.15625,
      "completions/mean_terminated_length": 368.15625,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "epoch": 0.1470468431771894,
      "grad_norm": 0.13892517468110852,
      "kl": 0.875244140625,
      "learning_rate": 1e-06,
      "loss": 0.0043,
      "num_tokens": 9772972.0,
      "reward": 5.078125,
      "reward_std": 0.7367126941680908,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.734375,
      "step": 361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 394.0,
      "completions/max_terminated_length": 394.0,
      "completions/mean_length": 258.4375,
      "completions/mean_terminated_length": 258.4375,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "epoch": 0.1474541751527495,
      "grad_norm": 0.08422152222290788,
      "kl": 0.91064453125,
      "learning_rate": 1e-06,
      "loss": 0.0011,
      "num_tokens": 9796242.0,
      "reward": 6.015625,
      "reward_std": 0.6708080172538757,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.5625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.953125,
      "step": 362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 611.0,
      "completions/max_terminated_length": 611.0,
      "completions/mean_length": 313.34375,
      "completions/mean_terminated_length": 313.34375,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "epoch": 0.14786150712830956,
      "grad_norm": 0.22427196336704966,
      "kl": 3.2813720703125,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 9822061.0,
      "reward": 5.6171875,
      "reward_std": 0.7913834452629089,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.5625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1585.0,
      "completions/max_terminated_length": 1585.0,
      "completions/mean_length": 360.5625,
      "completions/mean_terminated_length": 360.5625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 0.14826883910386965,
      "grad_norm": 0.15382775868441903,
      "kl": 0.376708984375,
      "learning_rate": 1e-06,
      "loss": 0.0121,
      "num_tokens": 9848535.0,
      "reward": 5.1171875,
      "reward_std": 1.0124129056930542,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 721.0,
      "completions/max_terminated_length": 721.0,
      "completions/mean_length": 384.9375,
      "completions/mean_terminated_length": 397.35483870967744,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.14867617107942974,
      "grad_norm": 0.18905957917264707,
      "kl": 0.3074951171875,
      "learning_rate": 1e-06,
      "loss": -0.016,
      "num_tokens": 9879997.0,
      "reward": 5.2421875,
      "reward_std": 1.2076013088226318,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8671875,
      "step": 365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 942.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 335.15625,
      "completions/mean_terminated_length": 335.15625,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "epoch": 0.14908350305498982,
      "grad_norm": 0.23612644375747585,
      "kl": 0.3916015625,
      "learning_rate": 1e-06,
      "loss": 0.0105,
      "num_tokens": 9905938.0,
      "reward": 5.9140625,
      "reward_std": 0.9861429929733276,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.65625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7890625,
      "step": 366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 739.0,
      "completions/max_terminated_length": 739.0,
      "completions/mean_length": 317.65625,
      "completions/mean_terminated_length": 317.65625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 0.14949083503054988,
      "grad_norm": 0.1375577599975736,
      "kl": 0.35107421875,
      "learning_rate": 1e-06,
      "loss": 0.0015,
      "num_tokens": 9930687.0,
      "reward": 5.6953125,
      "reward_std": 0.8999901413917542,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.59375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7578125,
      "step": 367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 537.0,
      "completions/max_terminated_length": 537.0,
      "completions/mean_length": 376.75,
      "completions/mean_terminated_length": 376.75,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 0.14989816700610997,
      "grad_norm": 0.12707335770193073,
      "kl": 0.291748046875,
      "learning_rate": 1e-06,
      "loss": 0.0018,
      "num_tokens": 9958063.0,
      "reward": 4.9453125,
      "reward_std": 0.9925188422203064,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 821.0,
      "completions/max_terminated_length": 821.0,
      "completions/mean_length": 409.84375,
      "completions/mean_terminated_length": 409.84375,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.15030549898167006,
      "grad_norm": 334.0803513591208,
      "kl": 2434.5240478515625,
      "learning_rate": 1e-06,
      "loss": 0.2369,
      "num_tokens": 9986194.0,
      "reward": 5.4296875,
      "reward_std": 0.9298094511032104,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9296875,
      "step": 369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 579.0,
      "completions/max_terminated_length": 579.0,
      "completions/mean_length": 350.125,
      "completions/mean_terminated_length": 350.125,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.15071283095723015,
      "grad_norm": 0.13930721173294916,
      "kl": 0.2474365234375,
      "learning_rate": 1e-06,
      "loss": -0.0026,
      "num_tokens": 10012830.0,
      "reward": 4.625,
      "reward_std": 1.1380959749221802,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9375,
      "step": 370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 695.0,
      "completions/max_terminated_length": 695.0,
      "completions/mean_length": 310.75,
      "completions/mean_terminated_length": 310.75,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.15112016293279024,
      "grad_norm": 0.1259032346915019,
      "kl": 0.7496337890625,
      "learning_rate": 1e-06,
      "loss": 0.0053,
      "num_tokens": 10036614.0,
      "reward": 5.59375,
      "reward_std": 0.8330329060554504,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.90625,
      "step": 371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 690.0,
      "completions/max_terminated_length": 690.0,
      "completions/mean_length": 419.8125,
      "completions/mean_terminated_length": 419.8125,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "epoch": 0.1515274949083503,
      "grad_norm": 0.15775251764914489,
      "kl": 0.2373046875,
      "learning_rate": 1e-06,
      "loss": -0.0064,
      "num_tokens": 10065488.0,
      "reward": 4.453125,
      "reward_std": 1.140798807144165,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.640625,
      "step": 372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 617.0,
      "completions/max_terminated_length": 617.0,
      "completions/mean_length": 369.9375,
      "completions/mean_terminated_length": 369.9375,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.15193482688391038,
      "grad_norm": 0.18144745491964562,
      "kl": 0.1636962890625,
      "learning_rate": 1e-06,
      "loss": 0.0042,
      "num_tokens": 10092462.0,
      "reward": 5.296875,
      "reward_std": 1.3446197509765625,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 651.0,
      "completions/max_terminated_length": 651.0,
      "completions/mean_length": 351.40625,
      "completions/mean_terminated_length": 351.40625,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.15234215885947047,
      "grad_norm": 0.14296036170269852,
      "kl": 0.565185546875,
      "learning_rate": 1e-06,
      "loss": 0.0021,
      "num_tokens": 10118707.0,
      "reward": 5.5703125,
      "reward_std": 0.8960230350494385,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 852.0,
      "completions/max_terminated_length": 852.0,
      "completions/mean_length": 321.8125,
      "completions/mean_terminated_length": 321.8125,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "epoch": 0.15274949083503056,
      "grad_norm": 0.0571801373583109,
      "kl": 0.10498046875,
      "learning_rate": 1e-06,
      "loss": -0.003,
      "num_tokens": 10143893.0,
      "reward": 5.7109375,
      "reward_std": 0.4177277386188507,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 854.0,
      "completions/max_terminated_length": 854.0,
      "completions/mean_length": 464.15625,
      "completions/mean_terminated_length": 464.15625,
      "completions/min_length": 263.0,
      "completions/min_terminated_length": 263.0,
      "epoch": 0.15315682281059062,
      "grad_norm": 0.1904269614370811,
      "kl": 0.350341796875,
      "learning_rate": 1e-06,
      "loss": 0.0054,
      "num_tokens": 10173354.0,
      "reward": 3.7421875,
      "reward_std": 1.1803107261657715,
      "rewards/cargo_build_reward": 0.46875,
      "rewards/cargo_clippy_reward": 0.46875,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.7578125,
      "step": 376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 649.0,
      "completions/max_terminated_length": 649.0,
      "completions/mean_length": 350.9375,
      "completions/mean_terminated_length": 350.9375,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.1535641547861507,
      "grad_norm": 0.16806719214370533,
      "kl": 0.1827392578125,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 10200480.0,
      "reward": 4.890625,
      "reward_std": 1.2776925563812256,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.765625,
      "step": 377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 602.0,
      "completions/max_terminated_length": 602.0,
      "completions/mean_length": 339.5,
      "completions/mean_terminated_length": 339.5,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.1539714867617108,
      "grad_norm": 0.13956120189042864,
      "kl": 0.251953125,
      "learning_rate": 1e-06,
      "loss": 0.002,
      "num_tokens": 10226192.0,
      "reward": 4.7265625,
      "reward_std": 0.7777705192565918,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 658.0,
      "completions/max_terminated_length": 658.0,
      "completions/mean_length": 299.28125,
      "completions/mean_terminated_length": 308.93548387096774,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.15437881873727088,
      "grad_norm": 0.11156327089525053,
      "kl": 0.432373046875,
      "learning_rate": 1e-06,
      "loss": -0.0066,
      "num_tokens": 10254345.0,
      "reward": 4.046875,
      "reward_std": 0.6096374988555908,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.359375,
      "step": 379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 614.0,
      "completions/max_terminated_length": 614.0,
      "completions/mean_length": 311.875,
      "completions/mean_terminated_length": 311.875,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "epoch": 0.15478615071283094,
      "grad_norm": 0.1238239866670562,
      "kl": 0.1663818359375,
      "learning_rate": 1e-06,
      "loss": 0.0046,
      "num_tokens": 10279469.0,
      "reward": 4.8671875,
      "reward_std": 0.8618788123130798,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 714.0,
      "completions/max_terminated_length": 714.0,
      "completions/mean_length": 366.9375,
      "completions/mean_terminated_length": 366.9375,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "epoch": 0.15519348268839103,
      "grad_norm": 0.1566580288929037,
      "kl": 0.1141357421875,
      "learning_rate": 1e-06,
      "loss": -0.0001,
      "num_tokens": 10306251.0,
      "reward": 5.0546875,
      "reward_std": 1.1590633392333984,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 424.0,
      "completions/max_terminated_length": 424.0,
      "completions/mean_length": 265.15625,
      "completions/mean_terminated_length": 265.15625,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "epoch": 0.15560081466395112,
      "grad_norm": 0.09000774004267857,
      "kl": 0.17578125,
      "learning_rate": 1e-06,
      "loss": 0.0041,
      "num_tokens": 10329976.0,
      "reward": 5.6953125,
      "reward_std": 0.6957299709320068,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.5625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 731.0,
      "completions/max_terminated_length": 731.0,
      "completions/mean_length": 313.34375,
      "completions/mean_terminated_length": 313.34375,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 0.1560081466395112,
      "grad_norm": 0.11616919463486235,
      "kl": 0.1746826171875,
      "learning_rate": 1e-06,
      "loss": 0.0007,
      "num_tokens": 10354731.0,
      "reward": 5.2421875,
      "reward_std": 0.9693155288696289,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8671875,
      "step": 383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 390.0,
      "completions/max_terminated_length": 390.0,
      "completions/mean_length": 275.625,
      "completions/mean_terminated_length": 275.625,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.1564154786150713,
      "grad_norm": 0.15165274236481607,
      "kl": 0.12646484375,
      "learning_rate": 1e-06,
      "loss": 0.003,
      "num_tokens": 10379231.0,
      "reward": 6.0390625,
      "reward_std": 1.0088050365447998,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.65625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 529.0,
      "completions/max_terminated_length": 529.0,
      "completions/mean_length": 271.1875,
      "completions/mean_terminated_length": 271.1875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.15682281059063136,
      "grad_norm": 0.15015585048817562,
      "kl": 0.1282958984375,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 10401717.0,
      "reward": 4.890625,
      "reward_std": 1.097530722618103,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.828125,
      "step": 385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 248.375,
      "completions/mean_terminated_length": 248.375,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "epoch": 0.15723014256619144,
      "grad_norm": 0.09102946894059866,
      "kl": 0.208740234375,
      "learning_rate": 1e-06,
      "loss": -0.0005,
      "num_tokens": 10423929.0,
      "reward": 5.921875,
      "reward_std": 0.9516997933387756,
      "rewards/cargo_build_reward": 1.0,
      "rewards/cargo_clippy_reward": 1.0,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 460.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 285.34375,
      "completions/mean_terminated_length": 285.34375,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.15763747454175153,
      "grad_norm": 0.10176893664604732,
      "kl": 0.1142578125,
      "learning_rate": 1e-06,
      "loss": -0.0017,
      "num_tokens": 10448044.0,
      "reward": 5.53125,
      "reward_std": 0.8825888633728027,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.96875,
      "step": 387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 643.0,
      "completions/max_terminated_length": 643.0,
      "completions/mean_length": 358.09375,
      "completions/mean_terminated_length": 358.09375,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.15804480651731162,
      "grad_norm": 0.09486203025489602,
      "kl": 0.2197265625,
      "learning_rate": 1e-06,
      "loss": 0.0018,
      "num_tokens": 10475279.0,
      "reward": 5.3515625,
      "reward_std": 0.7889389395713806,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7265625,
      "step": 388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 710.0,
      "completions/max_terminated_length": 710.0,
      "completions/mean_length": 285.71875,
      "completions/mean_terminated_length": 285.71875,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 0.15845213849287168,
      "grad_norm": 0.11315800904320061,
      "kl": 0.1761474609375,
      "learning_rate": 1e-06,
      "loss": -0.0062,
      "num_tokens": 10499462.0,
      "reward": 5.875,
      "reward_std": 0.8068819046020508,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.65625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.75,
      "step": 389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 415.0,
      "completions/max_terminated_length": 415.0,
      "completions/mean_length": 310.53125,
      "completions/mean_terminated_length": 310.53125,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.15885947046843177,
      "grad_norm": 0.116427112693145,
      "kl": 0.4306640625,
      "learning_rate": 1e-06,
      "loss": 0.0038,
      "num_tokens": 10524895.0,
      "reward": 5.0390625,
      "reward_std": 0.9798887968063354,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 801.0,
      "completions/max_terminated_length": 801.0,
      "completions/mean_length": 395.21875,
      "completions/mean_terminated_length": 395.21875,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "epoch": 0.15926680244399186,
      "grad_norm": 0.10921671461750218,
      "kl": 0.26708984375,
      "learning_rate": 1e-06,
      "loss": 0.0055,
      "num_tokens": 10552454.0,
      "reward": 5.21875,
      "reward_std": 0.7757423520088196,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.96875,
      "step": 391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 660.0,
      "completions/max_terminated_length": 660.0,
      "completions/mean_length": 361.9375,
      "completions/mean_terminated_length": 361.9375,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.15967413441955194,
      "grad_norm": 0.11276544935702329,
      "kl": 0.4576416015625,
      "learning_rate": 1e-06,
      "loss": -0.0028,
      "num_tokens": 10579828.0,
      "reward": 5.2265625,
      "reward_std": 0.9451630115509033,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8515625,
      "step": 392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 499.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 350.40625,
      "completions/mean_terminated_length": 350.40625,
      "completions/min_length": 229.0,
      "completions/min_terminated_length": 229.0,
      "epoch": 0.160081466395112,
      "grad_norm": 0.11822273106327352,
      "kl": 0.5804443359375,
      "learning_rate": 1e-06,
      "loss": -0.0028,
      "num_tokens": 10605409.0,
      "reward": 4.6953125,
      "reward_std": 0.9203661680221558,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9453125,
      "step": 393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 3345.0,
      "completions/max_terminated_length": 633.0,
      "completions/mean_length": 357.40625,
      "completions/mean_terminated_length": 261.03225806451616,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "epoch": 0.1604887983706721,
      "grad_norm": 0.10441250245049562,
      "kl": 0.18048095703125,
      "learning_rate": 1e-06,
      "loss": 0.1009,
      "num_tokens": 10632230.0,
      "reward": 6.1875,
      "reward_std": 0.6983473300933838,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.75,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8125,
      "step": 394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 497.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 237.8125,
      "completions/mean_terminated_length": 237.8125,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.16089613034623218,
      "grad_norm": 0.07926754950109575,
      "kl": 0.9730224609375,
      "learning_rate": 1e-06,
      "loss": -0.0021,
      "num_tokens": 10654432.0,
      "reward": 6.625,
      "reward_std": 0.3837546706199646,
      "rewards/cargo_build_reward": 1.0,
      "rewards/cargo_clippy_reward": 1.0,
      "rewards/cargo_test_reward": 0.9375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.75,
      "step": 395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2061.0,
      "completions/max_terminated_length": 2061.0,
      "completions/mean_length": 370.84375,
      "completions/mean_terminated_length": 370.84375,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.16130346232179227,
      "grad_norm": 0.1073240116584982,
      "kl": 0.2197265625,
      "learning_rate": 1e-06,
      "loss": -0.0018,
      "num_tokens": 10680451.0,
      "reward": 5.484375,
      "reward_std": 0.9157319068908691,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 869.0,
      "completions/max_terminated_length": 869.0,
      "completions/mean_length": 383.375,
      "completions/mean_terminated_length": 383.375,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.16171079429735236,
      "grad_norm": 0.09673030237234143,
      "kl": 0.6353759765625,
      "learning_rate": 1e-06,
      "loss": 0.0033,
      "num_tokens": 10707399.0,
      "reward": 5.1328125,
      "reward_std": 0.788161039352417,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 791.0,
      "completions/max_terminated_length": 791.0,
      "completions/mean_length": 363.625,
      "completions/mean_terminated_length": 363.625,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.16211812627291242,
      "grad_norm": 0.10485223516833546,
      "kl": 1.342529296875,
      "learning_rate": 1e-06,
      "loss": 0.0026,
      "num_tokens": 10734283.0,
      "reward": 5.046875,
      "reward_std": 0.567676842212677,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 664.0,
      "completions/max_terminated_length": 664.0,
      "completions/mean_length": 267.96875,
      "completions/mean_terminated_length": 267.96875,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "epoch": 0.1625254582484725,
      "grad_norm": 0.07787048703564636,
      "kl": 1.0731201171875,
      "learning_rate": 1e-06,
      "loss": -0.0014,
      "num_tokens": 10757706.0,
      "reward": 5.4609375,
      "reward_std": 0.2630031108856201,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7109375,
      "step": 399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 732.0,
      "completions/max_terminated_length": 732.0,
      "completions/mean_length": 474.03125,
      "completions/mean_terminated_length": 474.03125,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 0.1629327902240326,
      "grad_norm": 0.20822367287756927,
      "kl": 1.23486328125,
      "learning_rate": 1e-06,
      "loss": 0.0066,
      "num_tokens": 10787763.0,
      "reward": 4.6328125,
      "reward_std": 1.002000093460083,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6328125,
      "step": 400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 899.0,
      "completions/max_terminated_length": 899.0,
      "completions/mean_length": 393.78125,
      "completions/mean_terminated_length": 393.78125,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.16334012219959268,
      "grad_norm": 0.20937612914834222,
      "kl": 0.8896484375,
      "learning_rate": 1e-06,
      "loss": 0.0012,
      "num_tokens": 10814844.0,
      "reward": 5.0546875,
      "reward_std": 0.8593862056732178,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9609375,
      "step": 401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 691.0,
      "completions/max_terminated_length": 691.0,
      "completions/mean_length": 352.5,
      "completions/mean_terminated_length": 352.5,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.16374745417515274,
      "grad_norm": 0.1809280665979603,
      "kl": 0.12677001953125,
      "learning_rate": 1e-06,
      "loss": 0.0101,
      "num_tokens": 10841004.0,
      "reward": 5.7109375,
      "reward_std": 1.421525001525879,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8984375,
      "step": 402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 782.0,
      "completions/max_terminated_length": 782.0,
      "completions/mean_length": 301.5625,
      "completions/mean_terminated_length": 301.5625,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 0.16415478615071283,
      "grad_norm": 0.09698607024115674,
      "kl": 0.07666015625,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 10865790.0,
      "reward": 5.515625,
      "reward_std": 0.7098821401596069,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.578125,
      "step": 403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 611.0,
      "completions/max_terminated_length": 611.0,
      "completions/mean_length": 371.71875,
      "completions/mean_terminated_length": 371.71875,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 0.16456211812627292,
      "grad_norm": 0.18732994989651297,
      "kl": 0.161865234375,
      "learning_rate": 1e-06,
      "loss": 0.0081,
      "num_tokens": 10892269.0,
      "reward": 4.21875,
      "reward_std": 1.2697362899780273,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.625,
      "step": 404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 771.0,
      "completions/max_terminated_length": 771.0,
      "completions/mean_length": 350.3125,
      "completions/mean_terminated_length": 350.3125,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.164969450101833,
      "grad_norm": 0.13301919129004566,
      "kl": 0.1346435546875,
      "learning_rate": 1e-06,
      "loss": 0.0039,
      "num_tokens": 10918631.0,
      "reward": 5.3515625,
      "reward_std": 0.9757429361343384,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 519.0,
      "completions/max_terminated_length": 519.0,
      "completions/mean_length": 313.75,
      "completions/mean_terminated_length": 313.75,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 0.16537678207739306,
      "grad_norm": 0.10968180274000976,
      "kl": 0.10076904296875,
      "learning_rate": 1e-06,
      "loss": 0.0006,
      "num_tokens": 10943543.0,
      "reward": 5.0078125,
      "reward_std": 0.783983588218689,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9453125,
      "step": 406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 447.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 303.9375,
      "completions/mean_terminated_length": 303.9375,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.16578411405295315,
      "grad_norm": 0.08829224918003574,
      "kl": 0.0755615234375,
      "learning_rate": 1e-06,
      "loss": 0.0027,
      "num_tokens": 10968541.0,
      "reward": 5.359375,
      "reward_std": 0.68646639585495,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1042.0,
      "completions/max_terminated_length": 1042.0,
      "completions/mean_length": 417.90625,
      "completions/mean_terminated_length": 417.90625,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "epoch": 0.16619144602851324,
      "grad_norm": 0.15879792392845696,
      "kl": 0.1019287109375,
      "learning_rate": 1e-06,
      "loss": 0.001,
      "num_tokens": 10996930.0,
      "reward": 4.5625,
      "reward_std": 0.992995023727417,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 746.0,
      "completions/max_terminated_length": 746.0,
      "completions/mean_length": 350.5,
      "completions/mean_terminated_length": 350.5,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "epoch": 0.16659877800407333,
      "grad_norm": 0.1618737948435576,
      "kl": 0.06365966796875,
      "learning_rate": 1e-06,
      "loss": 0.0128,
      "num_tokens": 11022642.0,
      "reward": 5.1171875,
      "reward_std": 1.2929661273956299,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 967.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 370.21875,
      "completions/mean_terminated_length": 370.21875,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.1670061099796334,
      "grad_norm": 0.1469536219914811,
      "kl": 0.0606689453125,
      "learning_rate": 1e-06,
      "loss": 0.0088,
      "num_tokens": 11049953.0,
      "reward": 5.5,
      "reward_std": 0.8628734350204468,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.84375,
      "step": 410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 836.0,
      "completions/max_terminated_length": 836.0,
      "completions/mean_length": 344.1875,
      "completions/mean_terminated_length": 355.2903225806452,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 223.0,
      "epoch": 0.16741344195519348,
      "grad_norm": 0.28361110669123324,
      "kl": 0.15478515625,
      "learning_rate": 1e-06,
      "loss": -0.0111,
      "num_tokens": 11079064.0,
      "reward": 4.9296875,
      "reward_std": 1.547074556350708,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 0.96875,
      "rewards/non_empty_reward": 0.96875,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8671875,
      "step": 411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 672.0,
      "completions/max_terminated_length": 672.0,
      "completions/mean_length": 337.9375,
      "completions/mean_terminated_length": 337.9375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.16782077393075356,
      "grad_norm": 0.15166421938927865,
      "kl": 0.1522216796875,
      "learning_rate": 1e-06,
      "loss": 0.0051,
      "num_tokens": 11104446.0,
      "reward": 5.8046875,
      "reward_std": 0.7720569372177124,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 633.0,
      "completions/max_terminated_length": 633.0,
      "completions/mean_length": 329.96875,
      "completions/mean_terminated_length": 329.96875,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.16822810590631365,
      "grad_norm": 0.06293129841573249,
      "kl": 0.1748046875,
      "learning_rate": 1e-06,
      "loss": -0.0012,
      "num_tokens": 11129573.0,
      "reward": 4.609375,
      "reward_std": 0.5376979112625122,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.546875,
      "step": 413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 723.0,
      "completions/max_terminated_length": 723.0,
      "completions/mean_length": 356.625,
      "completions/mean_terminated_length": 356.625,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "epoch": 0.16863543788187374,
      "grad_norm": 0.22461425052244266,
      "kl": 0.1195068359375,
      "learning_rate": 1e-06,
      "loss": -0.0016,
      "num_tokens": 11156137.0,
      "reward": 4.640625,
      "reward_std": 1.0310025215148926,
      "rewards/cargo_build_reward": 0.5625,
      "rewards/cargo_clippy_reward": 0.5625,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.828125,
      "step": 414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 384.0,
      "completions/mean_length": 273.875,
      "completions/mean_terminated_length": 273.875,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "epoch": 0.1690427698574338,
      "grad_norm": 0.08429891831942984,
      "kl": 0.1258544921875,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 11180173.0,
      "reward": 6.421875,
      "reward_std": 0.5966733694076538,
      "rewards/cargo_build_reward": 1.0,
      "rewards/cargo_clippy_reward": 1.0,
      "rewards/cargo_test_reward": 0.75,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 595.0,
      "completions/max_terminated_length": 595.0,
      "completions/mean_length": 276.09375,
      "completions/mean_terminated_length": 276.09375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.1694501018329939,
      "grad_norm": 0.1456370513489768,
      "kl": 0.41259765625,
      "learning_rate": 1e-06,
      "loss": 0.0023,
      "num_tokens": 11203464.0,
      "reward": 6.0390625,
      "reward_std": 0.7834969758987427,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.6875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8515625,
      "step": 416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 963.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 442.8125,
      "completions/mean_terminated_length": 442.8125,
      "completions/min_length": 262.0,
      "completions/min_terminated_length": 262.0,
      "epoch": 0.16985743380855398,
      "grad_norm": 0.18604758304984106,
      "kl": 0.11004638671875,
      "learning_rate": 1e-06,
      "loss": 0.0198,
      "num_tokens": 11232954.0,
      "reward": 5.1953125,
      "reward_std": 1.1531951427459717,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 623.0,
      "completions/max_terminated_length": 623.0,
      "completions/mean_length": 287.53125,
      "completions/mean_terminated_length": 287.53125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "epoch": 0.17026476578411406,
      "grad_norm": 0.12467595783642056,
      "kl": 0.138916015625,
      "learning_rate": 1e-06,
      "loss": 0.002,
      "num_tokens": 11256659.0,
      "reward": 5.9453125,
      "reward_std": 1.1238775253295898,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.59375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8203125,
      "step": 418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 668.0,
      "completions/max_terminated_length": 668.0,
      "completions/mean_length": 323.21875,
      "completions/mean_terminated_length": 323.21875,
      "completions/min_length": 213.0,
      "completions/min_terminated_length": 213.0,
      "epoch": 0.17067209775967412,
      "grad_norm": 0.11434602722609279,
      "kl": 0.607177734375,
      "learning_rate": 1e-06,
      "loss": 0.0059,
      "num_tokens": 11282434.0,
      "reward": 4.59375,
      "reward_std": 0.6790916323661804,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.59375,
      "step": 419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 835.0,
      "completions/max_terminated_length": 835.0,
      "completions/mean_length": 446.6875,
      "completions/mean_terminated_length": 446.6875,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "epoch": 0.1710794297352342,
      "grad_norm": 0.19793586572081812,
      "kl": 1.4931640625,
      "learning_rate": 1e-06,
      "loss": -0.0069,
      "num_tokens": 11312552.0,
      "reward": 4.3359375,
      "reward_std": 1.09117591381073,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 791.0,
      "completions/max_terminated_length": 791.0,
      "completions/mean_length": 380.65625,
      "completions/mean_terminated_length": 380.65625,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "epoch": 0.1714867617107943,
      "grad_norm": 0.10793400514470229,
      "kl": 0.2606201171875,
      "learning_rate": 1e-06,
      "loss": -0.0013,
      "num_tokens": 11339461.0,
      "reward": 5.28125,
      "reward_std": 0.6398772597312927,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.84375,
      "step": 421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 508.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 297.5625,
      "completions/mean_terminated_length": 297.5625,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "epoch": 0.1718940936863544,
      "grad_norm": 0.19091039429941775,
      "kl": 0.3817138671875,
      "learning_rate": 1e-06,
      "loss": 0.005,
      "num_tokens": 11363895.0,
      "reward": 5.6796875,
      "reward_std": 1.2723278999328613,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9921875,
      "step": 422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 721.0,
      "completions/max_terminated_length": 721.0,
      "completions/mean_length": 344.625,
      "completions/mean_terminated_length": 344.625,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "epoch": 0.17230142566191445,
      "grad_norm": 0.11652509405044195,
      "kl": 0.5419921875,
      "learning_rate": 1e-06,
      "loss": 0.0026,
      "num_tokens": 11390211.0,
      "reward": 5.921875,
      "reward_std": 0.805986225605011,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 831.0,
      "completions/max_terminated_length": 831.0,
      "completions/mean_length": 352.03125,
      "completions/mean_terminated_length": 352.03125,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "epoch": 0.17270875763747454,
      "grad_norm": 0.0909377094601466,
      "kl": 0.56005859375,
      "learning_rate": 1e-06,
      "loss": 0.0021,
      "num_tokens": 11416860.0,
      "reward": 5.5703125,
      "reward_std": 0.6062199473381042,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8828125,
      "step": 424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 888.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 478.84375,
      "completions/mean_terminated_length": 478.84375,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "epoch": 0.17311608961303462,
      "grad_norm": 0.21006478717124752,
      "kl": 0.494140625,
      "learning_rate": 1e-06,
      "loss": 0.0077,
      "num_tokens": 11448007.0,
      "reward": 4.59375,
      "reward_std": 1.4197256565093994,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.90625,
      "step": 425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1124.0,
      "completions/max_terminated_length": 1124.0,
      "completions/mean_length": 328.3125,
      "completions/mean_terminated_length": 328.3125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "epoch": 0.1735234215885947,
      "grad_norm": 0.17995730877645025,
      "kl": 0.34423828125,
      "learning_rate": 1e-06,
      "loss": 0.0238,
      "num_tokens": 11474065.0,
      "reward": 6.015625,
      "reward_std": 0.8857485055923462,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.6875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.828125,
      "step": 426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 645.0,
      "completions/max_terminated_length": 645.0,
      "completions/mean_length": 393.0625,
      "completions/mean_terminated_length": 393.0625,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "epoch": 0.1739307535641548,
      "grad_norm": 0.17681931663172054,
      "kl": 0.390625,
      "learning_rate": 1e-06,
      "loss": -0.0016,
      "num_tokens": 11502435.0,
      "reward": 4.953125,
      "reward_std": 1.093381404876709,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.21875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.828125,
      "step": 427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 657.0,
      "completions/max_terminated_length": 657.0,
      "completions/mean_length": 362.84375,
      "completions/mean_terminated_length": 362.84375,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "epoch": 0.17433808553971486,
      "grad_norm": 0.19182910026101962,
      "kl": 1.423828125,
      "learning_rate": 1e-06,
      "loss": -0.0031,
      "num_tokens": 11529318.0,
      "reward": 5.234375,
      "reward_std": 1.2473208904266357,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 916.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 478.375,
      "completions/mean_terminated_length": 478.375,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 0.17474541751527495,
      "grad_norm": 0.10548379457160806,
      "kl": 0.30322265625,
      "learning_rate": 1e-06,
      "loss": 0.0008,
      "num_tokens": 11558874.0,
      "reward": 4.453125,
      "reward_std": 0.6049275398254395,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.0625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 481.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 349.75,
      "completions/mean_terminated_length": 349.75,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 0.17515274949083504,
      "grad_norm": 0.1807422808173608,
      "kl": 0.1004638671875,
      "learning_rate": 1e-06,
      "loss": 0.003,
      "num_tokens": 11585274.0,
      "reward": 5.421875,
      "reward_std": 1.433083415031433,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.890625,
      "step": 430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 975.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 427.15625,
      "completions/mean_terminated_length": 427.15625,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "epoch": 0.17556008146639512,
      "grad_norm": 0.1610145168588778,
      "kl": 0.323974609375,
      "learning_rate": 1e-06,
      "loss": 0.0048,
      "num_tokens": 11613639.0,
      "reward": 5.03125,
      "reward_std": 1.1478092670440674,
      "rewards/cargo_build_reward": 0.65625,
      "rewards/cargo_clippy_reward": 0.65625,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.78125,
      "step": 431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 580.0,
      "completions/max_terminated_length": 580.0,
      "completions/mean_length": 370.375,
      "completions/mean_terminated_length": 370.375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "epoch": 0.17596741344195518,
      "grad_norm": 0.1301613036980156,
      "kl": 0.260009765625,
      "learning_rate": 1e-06,
      "loss": 0.0027,
      "num_tokens": 11639987.0,
      "reward": 5.2890625,
      "reward_std": 0.7868574857711792,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 567.0,
      "completions/max_terminated_length": 567.0,
      "completions/mean_length": 286.46875,
      "completions/mean_terminated_length": 286.46875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.17637474541751527,
      "grad_norm": 0.10529105951652248,
      "kl": 0.5634765625,
      "learning_rate": 1e-06,
      "loss": 0.0006,
      "num_tokens": 11663602.0,
      "reward": 5.8984375,
      "reward_std": 0.6449092030525208,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.5625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8984375,
      "step": 433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 968.0,
      "completions/max_terminated_length": 968.0,
      "completions/mean_length": 374.8125,
      "completions/mean_terminated_length": 374.8125,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "epoch": 0.17678207739307536,
      "grad_norm": 0.2267686821810141,
      "kl": 0.27734375,
      "learning_rate": 1e-06,
      "loss": -0.0001,
      "num_tokens": 11690908.0,
      "reward": 4.703125,
      "reward_std": 1.112496256828308,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.765625,
      "step": 434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 622.0,
      "completions/max_terminated_length": 622.0,
      "completions/mean_length": 361.375,
      "completions/mean_terminated_length": 361.375,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "epoch": 0.17718940936863545,
      "grad_norm": 0.15644434744266977,
      "kl": 0.305419921875,
      "learning_rate": 1e-06,
      "loss": 0.008,
      "num_tokens": 11717744.0,
      "reward": 5.2265625,
      "reward_std": 0.9309062957763672,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6640625,
      "step": 435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 980.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 517.21875,
      "completions/mean_terminated_length": 517.21875,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "epoch": 0.1775967413441955,
      "grad_norm": 0.21620849209453313,
      "kl": 0.273681640625,
      "learning_rate": 1e-06,
      "loss": -0.0021,
      "num_tokens": 11748863.0,
      "reward": 4.9375,
      "reward_std": 1.206173300743103,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 594.0,
      "completions/max_terminated_length": 594.0,
      "completions/mean_length": 286.5,
      "completions/mean_terminated_length": 286.5,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "epoch": 0.1780040733197556,
      "grad_norm": 0.11184396135570342,
      "kl": 0.29833984375,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 11772679.0,
      "reward": 5.0625,
      "reward_std": 0.7270735502243042,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 736.0,
      "completions/max_terminated_length": 736.0,
      "completions/mean_length": 327.34375,
      "completions/mean_terminated_length": 327.34375,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "epoch": 0.17841140529531568,
      "grad_norm": 0.21751636937732396,
      "kl": 1.451171875,
      "learning_rate": 1e-06,
      "loss": 0.0008,
      "num_tokens": 11798578.0,
      "reward": 5.84375,
      "reward_std": 1.2837473154067993,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.65625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.78125,
      "step": 438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 710.0,
      "completions/max_terminated_length": 710.0,
      "completions/mean_length": 405.09375,
      "completions/mean_terminated_length": 405.09375,
      "completions/min_length": 242.0,
      "completions/min_terminated_length": 242.0,
      "epoch": 0.17881873727087577,
      "grad_norm": 0.19611327561773098,
      "kl": 0.892333984375,
      "learning_rate": 1e-06,
      "loss": 0.0061,
      "num_tokens": 11826781.0,
      "reward": 4.59375,
      "reward_std": 1.0302538871765137,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.84375,
      "step": 439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 730.0,
      "completions/max_terminated_length": 730.0,
      "completions/mean_length": 356.3125,
      "completions/mean_terminated_length": 356.3125,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "epoch": 0.17922606924643583,
      "grad_norm": 0.15175189330360042,
      "kl": 0.30322265625,
      "learning_rate": 1e-06,
      "loss": -0.0032,
      "num_tokens": 11853647.0,
      "reward": 5.71875,
      "reward_std": 0.9608312845230103,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.59375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8125,
      "step": 440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 647.0,
      "completions/max_terminated_length": 647.0,
      "completions/mean_length": 330.5,
      "completions/mean_terminated_length": 330.5,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.17963340122199592,
      "grad_norm": 0.23551920970756896,
      "kl": 0.458251953125,
      "learning_rate": 1e-06,
      "loss": -0.0033,
      "num_tokens": 11879143.0,
      "reward": 5.34375,
      "reward_std": 1.275673508644104,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.5625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.53125,
      "step": 441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 715.0,
      "completions/max_terminated_length": 715.0,
      "completions/mean_length": 341.96875,
      "completions/mean_terminated_length": 341.96875,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.180040733197556,
      "grad_norm": 0.19668837839183034,
      "kl": 0.85009765625,
      "learning_rate": 1e-06,
      "loss": 0.0072,
      "num_tokens": 11904430.0,
      "reward": 5.828125,
      "reward_std": 1.0695008039474487,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.953125,
      "step": 442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 432.0,
      "completions/max_terminated_length": 432.0,
      "completions/mean_length": 294.125,
      "completions/mean_terminated_length": 294.125,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "epoch": 0.1804480651731161,
      "grad_norm": 0.15123640668333174,
      "kl": 0.57861328125,
      "learning_rate": 1e-06,
      "loss": -0.0018,
      "num_tokens": 11927770.0,
      "reward": 5.7109375,
      "reward_std": 0.8943933844566345,
      "rewards/cargo_build_reward": 1.0,
      "rewards/cargo_clippy_reward": 1.0,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9609375,
      "step": 443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 491.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 355.1875,
      "completions/mean_terminated_length": 354.8709677419355,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 236.0,
      "epoch": 0.18085539714867618,
      "grad_norm": 0.3399968048336642,
      "kl": 3.849609375,
      "learning_rate": 1e-06,
      "loss": 0.0042,
      "num_tokens": 11954120.0,
      "reward": 5.6171875,
      "reward_std": 0.9756202101707458,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9296875,
      "step": 444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 585.0,
      "completions/max_terminated_length": 585.0,
      "completions/mean_length": 363.71875,
      "completions/mean_terminated_length": 363.71875,
      "completions/min_length": 251.0,
      "completions/min_terminated_length": 251.0,
      "epoch": 0.18126272912423624,
      "grad_norm": 0.0856631271266159,
      "kl": 0.323486328125,
      "learning_rate": 1e-06,
      "loss": 0.0025,
      "num_tokens": 11981247.0,
      "reward": 4.9140625,
      "reward_std": 0.4155312776565552,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.03125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 674.0,
      "completions/max_terminated_length": 674.0,
      "completions/mean_length": 303.09375,
      "completions/mean_terminated_length": 303.09375,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "epoch": 0.18167006109979633,
      "grad_norm": 0.5581568917702728,
      "kl": 6.1220703125,
      "learning_rate": 1e-06,
      "loss": 0.005,
      "num_tokens": 12005490.0,
      "reward": 5.9765625,
      "reward_std": 0.9034623503684998,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7890625,
      "step": 446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 776.0,
      "completions/max_terminated_length": 776.0,
      "completions/mean_length": 400.25,
      "completions/mean_terminated_length": 400.25,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 0.18207739307535642,
      "grad_norm": 0.16536634201006845,
      "kl": 0.5810546875,
      "learning_rate": 1e-06,
      "loss": 0.0013,
      "num_tokens": 12033810.0,
      "reward": 5.2421875,
      "reward_std": 0.9611786603927612,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6796875,
      "step": 447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 428.0,
      "completions/max_terminated_length": 428.0,
      "completions/mean_length": 269.21875,
      "completions/mean_terminated_length": 269.21875,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 188.0,
      "epoch": 0.1824847250509165,
      "grad_norm": 0.12243037217908188,
      "kl": 0.651611328125,
      "learning_rate": 1e-06,
      "loss": -0.0005,
      "num_tokens": 12057809.0,
      "reward": 6.421875,
      "reward_std": 0.992490291595459,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.84375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 763.0,
      "completions/max_terminated_length": 763.0,
      "completions/mean_length": 420.0,
      "completions/mean_terminated_length": 420.0,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "epoch": 0.18289205702647657,
      "grad_norm": 0.1586099231207866,
      "kl": 1.142578125,
      "learning_rate": 1e-06,
      "loss": 0.004,
      "num_tokens": 12085865.0,
      "reward": 4.5859375,
      "reward_std": 0.9468885064125061,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 625.0,
      "completions/max_terminated_length": 625.0,
      "completions/mean_length": 361.375,
      "completions/mean_terminated_length": 361.375,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "epoch": 0.18329938900203666,
      "grad_norm": 0.19762038686403974,
      "kl": 0.309326171875,
      "learning_rate": 1e-06,
      "loss": 0.0051,
      "num_tokens": 12112197.0,
      "reward": 5.2890625,
      "reward_std": 1.0431478023529053,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.78125,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.9296875,
      "step": 450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 778.0,
      "completions/max_terminated_length": 778.0,
      "completions/mean_length": 398.46875,
      "completions/mean_terminated_length": 398.46875,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "epoch": 0.18370672097759674,
      "grad_norm": 0.11791037915643654,
      "kl": 0.3271484375,
      "learning_rate": 1e-06,
      "loss": -0.0041,
      "num_tokens": 12139140.0,
      "reward": 5.453125,
      "reward_std": 0.7159276008605957,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.953125,
      "step": 451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 603.0,
      "completions/max_terminated_length": 603.0,
      "completions/mean_length": 387.59375,
      "completions/mean_terminated_length": 387.59375,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 0.18411405295315683,
      "grad_norm": 0.19748641854512347,
      "kl": 1.38232421875,
      "learning_rate": 1e-06,
      "loss": 0.0083,
      "num_tokens": 12166903.0,
      "reward": 4.7109375,
      "reward_std": 0.9857768416404724,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8984375,
      "step": 452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 734.0,
      "completions/max_terminated_length": 734.0,
      "completions/mean_length": 370.8125,
      "completions/mean_terminated_length": 395.53333333333336,
      "completions/min_length": 0.0,
      "completions/min_terminated_length": 242.0,
      "epoch": 0.1845213849287169,
      "grad_norm": 0.26793835232040064,
      "kl": 1.05419921875,
      "learning_rate": 1e-06,
      "loss": -0.0252,
      "num_tokens": 12201155.0,
      "reward": 4.921875,
      "reward_std": 1.447327971458435,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 0.9375,
      "rewards/non_empty_reward": 0.9375,
      "rewards/test_block_count_reward": 0.9375,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 558.0,
      "completions/max_terminated_length": 558.0,
      "completions/mean_length": 258.46875,
      "completions/mean_terminated_length": 258.46875,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "epoch": 0.18492871690427698,
      "grad_norm": 0.11951130654168295,
      "kl": 0.2705078125,
      "learning_rate": 1e-06,
      "loss": 0.0,
      "num_tokens": 12224282.0,
      "reward": 5.2109375,
      "reward_std": 0.8302067518234253,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6484375,
      "step": 454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 578.0,
      "completions/max_terminated_length": 578.0,
      "completions/mean_length": 350.15625,
      "completions/mean_terminated_length": 350.15625,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.18533604887983707,
      "grad_norm": 0.22151210736583518,
      "kl": 1.805908203125,
      "learning_rate": 1e-06,
      "loss": 0.0042,
      "num_tokens": 12250799.0,
      "reward": 5.8125,
      "reward_std": 0.5101194381713867,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.65625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.65625,
      "step": 455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 872.0,
      "completions/max_terminated_length": 872.0,
      "completions/mean_length": 415.0,
      "completions/mean_terminated_length": 415.0,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "epoch": 0.18574338085539716,
      "grad_norm": 0.19235112400930446,
      "kl": 0.5244140625,
      "learning_rate": 1e-06,
      "loss": -0.0024,
      "num_tokens": 12279199.0,
      "reward": 5.2109375,
      "reward_std": 1.1287403106689453,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8984375,
      "step": 456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 609.0,
      "completions/max_terminated_length": 609.0,
      "completions/mean_length": 324.96875,
      "completions/mean_terminated_length": 324.96875,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "epoch": 0.18615071283095724,
      "grad_norm": 0.2618561387504406,
      "kl": 2.25390625,
      "learning_rate": 1e-06,
      "loss": -0.0004,
      "num_tokens": 12304854.0,
      "reward": 5.796875,
      "reward_std": 0.9935402870178223,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 926.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 327.0,
      "completions/mean_terminated_length": 327.0,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "epoch": 0.1865580448065173,
      "grad_norm": 0.14057633470756206,
      "kl": 1.26025390625,
      "learning_rate": 1e-06,
      "loss": 0.0038,
      "num_tokens": 12330454.0,
      "reward": 5.609375,
      "reward_std": 0.8704342246055603,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.796875,
      "step": 458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 622.0,
      "completions/max_terminated_length": 622.0,
      "completions/mean_length": 380.0,
      "completions/mean_terminated_length": 380.0,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.1869653767820774,
      "grad_norm": 0.14731921461654593,
      "kl": 0.5023193359375,
      "learning_rate": 1e-06,
      "loss": 0.0037,
      "num_tokens": 12358278.0,
      "reward": 4.8984375,
      "reward_std": 0.999342679977417,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8984375,
      "step": 459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 408.28125,
      "completions/mean_terminated_length": 408.28125,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.18737270875763748,
      "grad_norm": 0.15995704734078894,
      "kl": 0.92578125,
      "learning_rate": 1e-06,
      "loss": -0.0009,
      "num_tokens": 12386247.0,
      "reward": 4.609375,
      "reward_std": 1.1562089920043945,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.1875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 605.0,
      "completions/max_terminated_length": 605.0,
      "completions/mean_length": 356.5625,
      "completions/mean_terminated_length": 356.5625,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "epoch": 0.18778004073319757,
      "grad_norm": 0.20457059650057555,
      "kl": 0.61572265625,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 12412649.0,
      "reward": 5.5625,
      "reward_std": 0.7501891851425171,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9375,
      "step": 461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1093.0,
      "completions/max_terminated_length": 1093.0,
      "completions/mean_length": 396.40625,
      "completions/mean_terminated_length": 396.40625,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.18818737270875763,
      "grad_norm": 0.1654841583796105,
      "kl": 0.4827880859375,
      "learning_rate": 1e-06,
      "loss": 0.0085,
      "num_tokens": 12441142.0,
      "reward": 4.703125,
      "reward_std": 1.057011604309082,
      "rewards/cargo_build_reward": 0.78125,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.09375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.984375,
      "step": 462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1373.0,
      "completions/max_terminated_length": 1373.0,
      "completions/mean_length": 528.75,
      "completions/mean_terminated_length": 528.75,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "epoch": 0.18859470468431772,
      "grad_norm": 0.33716659828305967,
      "kl": 0.581298828125,
      "learning_rate": 1e-06,
      "loss": 0.022,
      "num_tokens": 12473190.0,
      "reward": 5.0234375,
      "reward_std": 1.5583802461624146,
      "rewards/cargo_build_reward": 0.6875,
      "rewards/cargo_clippy_reward": 0.6875,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 601.0,
      "completions/max_terminated_length": 601.0,
      "completions/mean_length": 303.375,
      "completions/mean_terminated_length": 303.375,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.1890020366598778,
      "grad_norm": 0.14091463041705182,
      "kl": 0.2255859375,
      "learning_rate": 1e-06,
      "loss": -0.0017,
      "num_tokens": 12497322.0,
      "reward": 5.2734375,
      "reward_std": 1.10199773311615,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8984375,
      "step": 464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 861.0,
      "completions/max_terminated_length": 861.0,
      "completions/mean_length": 451.03125,
      "completions/mean_terminated_length": 451.03125,
      "completions/min_length": 278.0,
      "completions/min_terminated_length": 278.0,
      "epoch": 0.1894093686354379,
      "grad_norm": 0.12767800369089213,
      "kl": 0.304931640625,
      "learning_rate": 1e-06,
      "loss": -0.0029,
      "num_tokens": 12527443.0,
      "reward": 5.03125,
      "reward_std": 0.7347978353500366,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.65625,
      "step": 465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 606.0,
      "completions/max_terminated_length": 606.0,
      "completions/mean_length": 325.25,
      "completions/mean_terminated_length": 325.25,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.18981670061099795,
      "grad_norm": 0.05474815313000358,
      "kl": 0.1480712890625,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 12553059.0,
      "reward": 6.171875,
      "reward_std": 0.4042172133922577,
      "rewards/cargo_build_reward": 1.0,
      "rewards/cargo_clippy_reward": 1.0,
      "rewards/cargo_test_reward": 0.625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 489.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 281.125,
      "completions/mean_terminated_length": 281.125,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.19022403258655804,
      "grad_norm": 0.11779257666699224,
      "kl": 0.45068359375,
      "learning_rate": 1e-06,
      "loss": 0.0006,
      "num_tokens": 12576679.0,
      "reward": 5.1171875,
      "reward_std": 0.5848700404167175,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7421875,
      "step": 467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 565.0,
      "completions/max_terminated_length": 565.0,
      "completions/mean_length": 326.6875,
      "completions/mean_terminated_length": 326.6875,
      "completions/min_length": 227.0,
      "completions/min_terminated_length": 227.0,
      "epoch": 0.19063136456211813,
      "grad_norm": 0.1447683885773122,
      "kl": 0.330810546875,
      "learning_rate": 1e-06,
      "loss": 0.0031,
      "num_tokens": 12602325.0,
      "reward": 5.078125,
      "reward_std": 0.9557596445083618,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.3125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.828125,
      "step": 468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 483.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 369.3125,
      "completions/mean_terminated_length": 369.3125,
      "completions/min_length": 240.0,
      "completions/min_terminated_length": 240.0,
      "epoch": 0.19103869653767822,
      "grad_norm": 0.1527553488628639,
      "kl": 0.35943603515625,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 12629103.0,
      "reward": 5.2578125,
      "reward_std": 0.8825398087501526,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9453125,
      "step": 469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 728.0,
      "completions/max_terminated_length": 728.0,
      "completions/mean_length": 346.625,
      "completions/mean_terminated_length": 346.625,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "epoch": 0.19144602851323828,
      "grad_norm": 0.1369293138745396,
      "kl": 0.5859375,
      "learning_rate": 1e-06,
      "loss": -0.0088,
      "num_tokens": 12654891.0,
      "reward": 5.9375,
      "reward_std": 0.761785089969635,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.5625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 1.0,
      "step": 470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 908.0,
      "completions/max_terminated_length": 908.0,
      "completions/mean_length": 476.09375,
      "completions/mean_terminated_length": 476.09375,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.19185336048879836,
      "grad_norm": 0.16020188437510458,
      "kl": 0.2183837890625,
      "learning_rate": 1e-06,
      "loss": 0.0008,
      "num_tokens": 12685414.0,
      "reward": 5.046875,
      "reward_std": 1.0090612173080444,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.25,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 562.0,
      "completions/max_terminated_length": 562.0,
      "completions/mean_length": 333.5,
      "completions/mean_terminated_length": 333.5,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "epoch": 0.19226069246435845,
      "grad_norm": 0.18716380062978372,
      "kl": 0.240966796875,
      "learning_rate": 1e-06,
      "loss": -0.0048,
      "num_tokens": 12711390.0,
      "reward": 5.5625,
      "reward_std": 1.2912083864212036,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.46875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 843.0,
      "completions/max_terminated_length": 843.0,
      "completions/mean_length": 316.5625,
      "completions/mean_terminated_length": 316.5625,
      "completions/min_length": 166.0,
      "completions/min_terminated_length": 166.0,
      "epoch": 0.19266802443991854,
      "grad_norm": 0.12726416523937797,
      "kl": 0.510986328125,
      "learning_rate": 1e-06,
      "loss": 0.0031,
      "num_tokens": 12736760.0,
      "reward": 5.75,
      "reward_std": 0.4646196663379669,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9375,
      "step": 473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 562.0,
      "completions/max_terminated_length": 562.0,
      "completions/mean_length": 322.0625,
      "completions/mean_terminated_length": 322.0625,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "epoch": 0.19307535641547863,
      "grad_norm": 0.19139405503932053,
      "kl": 0.6455078125,
      "learning_rate": 1e-06,
      "loss": 0.0089,
      "num_tokens": 12762074.0,
      "reward": 5.3671875,
      "reward_std": 1.0574750900268555,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8359375,
      "step": 474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 797.0,
      "completions/max_terminated_length": 797.0,
      "completions/mean_length": 413.1875,
      "completions/mean_terminated_length": 413.1875,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "epoch": 0.1934826883910387,
      "grad_norm": 0.20173974979643558,
      "kl": 0.47998046875,
      "learning_rate": 1e-06,
      "loss": 0.008,
      "num_tokens": 12790776.0,
      "reward": 4.328125,
      "reward_std": 1.2768642902374268,
      "rewards/cargo_build_reward": 0.59375,
      "rewards/cargo_clippy_reward": 0.59375,
      "rewards/cargo_test_reward": 0.125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.890625,
      "step": 475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1528.0,
      "completions/max_terminated_length": 1528.0,
      "completions/mean_length": 471.15625,
      "completions/mean_terminated_length": 471.15625,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.19389002036659878,
      "grad_norm": 0.19938856692223425,
      "kl": 0.365478515625,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 12821157.0,
      "reward": 4.6640625,
      "reward_std": 1.1157841682434082,
      "rewards/cargo_build_reward": 0.71875,
      "rewards/cargo_clippy_reward": 0.71875,
      "rewards/cargo_test_reward": 0.15625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 501.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 302.9375,
      "completions/mean_terminated_length": 302.9375,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "epoch": 0.19429735234215886,
      "grad_norm": 0.21634659107954318,
      "kl": 0.4873046875,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 12846059.0,
      "reward": 6.2109375,
      "reward_std": 0.763170599937439,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.8125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7734375,
      "step": 477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 471.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 296.46875,
      "completions/mean_terminated_length": 295.93548387096774,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "epoch": 0.19470468431771895,
      "grad_norm": 0.15580036522512727,
      "kl": 1.927978515625,
      "learning_rate": 1e-06,
      "loss": 0.0022,
      "num_tokens": 12870202.0,
      "reward": 5.8203125,
      "reward_std": 1.0648064613342285,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9453125,
      "step": 478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 675.0,
      "completions/max_terminated_length": 675.0,
      "completions/mean_length": 335.0625,
      "completions/mean_terminated_length": 335.0625,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "epoch": 0.195112016293279,
      "grad_norm": 0.14490128955032502,
      "kl": 0.397216796875,
      "learning_rate": 1e-06,
      "loss": -0.0028,
      "num_tokens": 12895668.0,
      "reward": 5.625,
      "reward_std": 0.9494472742080688,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.9375,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1056.0,
      "completions/max_terminated_length": 1056.0,
      "completions/mean_length": 481.28125,
      "completions/mean_terminated_length": 481.28125,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.1955193482688391,
      "grad_norm": 0.17211256383646323,
      "kl": 1.50634765625,
      "learning_rate": 1e-06,
      "loss": 0.003,
      "num_tokens": 12926229.0,
      "reward": 5.1640625,
      "reward_std": 0.5368491411209106,
      "rewards/cargo_build_reward": 0.625,
      "rewards/cargo_clippy_reward": 0.625,
      "rewards/cargo_test_reward": 0.5,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 892.0,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 445.125,
      "completions/mean_terminated_length": 445.125,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 0.1959266802443992,
      "grad_norm": 1.266372236336311,
      "kl": 6.962890625,
      "learning_rate": 1e-06,
      "loss": 0.0156,
      "num_tokens": 12956073.0,
      "reward": 5.34375,
      "reward_std": 1.0504249334335327,
      "rewards/cargo_build_reward": 0.9375,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.75,
      "step": 481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 409.0,
      "completions/max_terminated_length": 409.0,
      "completions/mean_length": 278.125,
      "completions/mean_terminated_length": 278.125,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "epoch": 0.19633401221995928,
      "grad_norm": 0.2264456989202532,
      "kl": 1.4228515625,
      "learning_rate": 1e-06,
      "loss": -0.0021,
      "num_tokens": 12980477.0,
      "reward": 5.265625,
      "reward_std": 1.2569725513458252,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.828125,
      "step": 482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 485.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 289.28125,
      "completions/mean_terminated_length": 289.28125,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "epoch": 0.19674134419551934,
      "grad_norm": 0.1574338951932043,
      "kl": 0.398681640625,
      "learning_rate": 1e-06,
      "loss": -0.0012,
      "num_tokens": 13004566.0,
      "reward": 6.3125,
      "reward_std": 0.7749233841896057,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.71875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9375,
      "step": 483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 895.0,
      "completions/max_terminated_length": 895.0,
      "completions/mean_length": 391.21875,
      "completions/mean_terminated_length": 391.21875,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 0.19714867617107942,
      "grad_norm": 0.22265607253657393,
      "kl": 1.1279296875,
      "learning_rate": 1e-06,
      "loss": 0.0065,
      "num_tokens": 13032013.0,
      "reward": 5.5234375,
      "reward_std": 1.4083189964294434,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.7109375,
      "step": 484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 537.0,
      "completions/max_terminated_length": 537.0,
      "completions/mean_length": 329.53125,
      "completions/mean_terminated_length": 329.53125,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "epoch": 0.1975560081466395,
      "grad_norm": 0.16963911268826184,
      "kl": 0.841796875,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 13058102.0,
      "reward": 5.75,
      "reward_std": 1.0953752994537354,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.875,
      "step": 485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 596.0,
      "completions/max_terminated_length": 596.0,
      "completions/mean_length": 364.8125,
      "completions/mean_terminated_length": 364.8125,
      "completions/min_length": 189.0,
      "completions/min_terminated_length": 189.0,
      "epoch": 0.1979633401221996,
      "grad_norm": 0.1374298967851221,
      "kl": 0.864501953125,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 13085072.0,
      "reward": 5.421875,
      "reward_std": 0.736834704875946,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.921875,
      "step": 486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 492.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 298.21875,
      "completions/mean_terminated_length": 298.21875,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "epoch": 0.1983706720977597,
      "grad_norm": 0.1261195087623572,
      "kl": 0.7607421875,
      "learning_rate": 1e-06,
      "loss": 0.0053,
      "num_tokens": 13109911.0,
      "reward": 5.546875,
      "reward_std": 0.8305887579917908,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 473.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 310.96875,
      "completions/mean_terminated_length": 310.96875,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "epoch": 0.19877800407331975,
      "grad_norm": 0.20554491217342194,
      "kl": 0.95458984375,
      "learning_rate": 1e-06,
      "loss": 0.0038,
      "num_tokens": 13134190.0,
      "reward": 5.5390625,
      "reward_std": 0.9816341996192932,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.4375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9140625,
      "step": 488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 554.0,
      "completions/max_terminated_length": 554.0,
      "completions/mean_length": 266.375,
      "completions/mean_terminated_length": 266.375,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.19918533604887984,
      "grad_norm": 0.13393542281782445,
      "kl": 0.2066650390625,
      "learning_rate": 1e-06,
      "loss": -0.0014,
      "num_tokens": 13157778.0,
      "reward": 6.2578125,
      "reward_std": 0.9760429263114929,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.75,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9453125,
      "step": 489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 701.0,
      "completions/max_terminated_length": 701.0,
      "completions/mean_length": 287.0625,
      "completions/mean_terminated_length": 287.0625,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "epoch": 0.19959266802443992,
      "grad_norm": 0.14381180965090343,
      "kl": 0.569091796875,
      "learning_rate": 1e-06,
      "loss": 0.0085,
      "num_tokens": 13182436.0,
      "reward": 6.1875,
      "reward_std": 0.6123279333114624,
      "rewards/cargo_build_reward": 0.90625,
      "rewards/cargo_clippy_reward": 0.90625,
      "rewards/cargo_test_reward": 0.6875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 1.0,
      "step": 490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 662.0,
      "completions/max_terminated_length": 662.0,
      "completions/mean_length": 372.40625,
      "completions/mean_terminated_length": 372.40625,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "epoch": 0.2,
      "grad_norm": 0.20813935210236095,
      "kl": 0.9661865234375,
      "learning_rate": 1e-06,
      "loss": 0.0042,
      "num_tokens": 13209961.0,
      "reward": 5.609375,
      "reward_std": 1.2343088388442993,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 492.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 277.53125,
      "completions/mean_terminated_length": 277.53125,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "epoch": 0.20040733197556007,
      "grad_norm": 0.18185843577224053,
      "kl": 1.46142578125,
      "learning_rate": 1e-06,
      "loss": -0.0021,
      "num_tokens": 13233450.0,
      "reward": 6.0859375,
      "reward_std": 0.696807861328125,
      "rewards/cargo_build_reward": 1.0,
      "rewards/cargo_clippy_reward": 1.0,
      "rewards/cargo_test_reward": 0.59375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8984375,
      "step": 492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 581.0,
      "completions/max_terminated_length": 581.0,
      "completions/mean_length": 360.34375,
      "completions/mean_terminated_length": 360.34375,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "epoch": 0.20081466395112016,
      "grad_norm": 0.18184364860984456,
      "kl": 2.14453125,
      "learning_rate": 1e-06,
      "loss": 0.0046,
      "num_tokens": 13260229.0,
      "reward": 6.1171875,
      "reward_std": 0.9113576412200928,
      "rewards/cargo_build_reward": 0.96875,
      "rewards/cargo_clippy_reward": 0.96875,
      "rewards/cargo_test_reward": 0.6875,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.8046875,
      "step": 493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 748.0,
      "completions/max_terminated_length": 748.0,
      "completions/mean_length": 430.21875,
      "completions/mean_terminated_length": 430.21875,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "epoch": 0.20122199592668025,
      "grad_norm": 0.3999569580272008,
      "kl": 3.3974609375,
      "learning_rate": 1e-06,
      "loss": -0.0024,
      "num_tokens": 13289452.0,
      "reward": 5.421875,
      "reward_std": 1.0172343254089355,
      "rewards/cargo_build_reward": 0.84375,
      "rewards/cargo_clippy_reward": 0.84375,
      "rewards/cargo_test_reward": 0.40625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.9375,
      "step": 494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 533.0,
      "completions/max_terminated_length": 533.0,
      "completions/mean_length": 318.15625,
      "completions/mean_terminated_length": 318.15625,
      "completions/min_length": 201.0,
      "completions/min_terminated_length": 201.0,
      "epoch": 0.20162932790224034,
      "grad_norm": 0.18334239173462724,
      "kl": 0.400146484375,
      "learning_rate": 1e-06,
      "loss": 0.0032,
      "num_tokens": 13314457.0,
      "reward": 6.0,
      "reward_std": 1.0162653923034668,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 1.0,
      "step": 495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 621.0,
      "completions/max_terminated_length": 621.0,
      "completions/mean_length": 319.96875,
      "completions/mean_terminated_length": 319.96875,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.2020366598778004,
      "grad_norm": 0.45914975101690647,
      "kl": 3.718505859375,
      "learning_rate": 1e-06,
      "loss": -0.0052,
      "num_tokens": 13339552.0,
      "reward": 5.1484375,
      "reward_std": 1.227099895477295,
      "rewards/cargo_build_reward": 0.8125,
      "rewards/cargo_clippy_reward": 0.8125,
      "rewards/cargo_test_reward": 0.34375,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 0.96875,
      "rewards/tests_have_asserts_reward": 0.8515625,
      "step": 496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 439.0,
      "completions/max_terminated_length": 439.0,
      "completions/mean_length": 265.78125,
      "completions/mean_terminated_length": 265.78125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.20244399185336048,
      "grad_norm": 0.1617705252962688,
      "kl": 2.36474609375,
      "learning_rate": 1e-06,
      "loss": -0.0001,
      "num_tokens": 13362441.0,
      "reward": 6.2109375,
      "reward_std": 0.4671742618083954,
      "rewards/cargo_build_reward": 1.0,
      "rewards/cargo_clippy_reward": 1.0,
      "rewards/cargo_test_reward": 0.625,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.9609375,
      "step": 497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 342.0,
      "completions/max_terminated_length": 342.0,
      "completions/mean_length": 211.625,
      "completions/mean_terminated_length": 211.625,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.20285132382892057,
      "grad_norm": 0.05636492303275045,
      "kl": 0.39111328125,
      "learning_rate": 1e-06,
      "loss": -0.0009,
      "num_tokens": 13382925.0,
      "reward": 6.2578125,
      "reward_std": 0.4500587284564972,
      "rewards/cargo_build_reward": 1.0,
      "rewards/cargo_clippy_reward": 1.0,
      "rewards/cargo_test_reward": 0.78125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.6953125,
      "step": 498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 614.0,
      "completions/max_terminated_length": 614.0,
      "completions/mean_length": 371.53125,
      "completions/mean_terminated_length": 371.53125,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.20325865580448066,
      "grad_norm": 0.2578517900980799,
      "kl": 1.40234375,
      "learning_rate": 1e-06,
      "loss": 0.0099,
      "num_tokens": 13409670.0,
      "reward": 4.921875,
      "reward_std": 1.2841131687164307,
      "rewards/cargo_build_reward": 0.75,
      "rewards/cargo_clippy_reward": 0.75,
      "rewards/cargo_test_reward": 0.28125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.859375,
      "step": 499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 984.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 366.71875,
      "completions/mean_terminated_length": 366.71875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "epoch": 0.20366598778004075,
      "grad_norm": 0.22751045945985918,
      "kl": 0.599609375,
      "learning_rate": 1e-06,
      "loss": 0.0091,
      "num_tokens": 13435773.0,
      "reward": 5.546875,
      "reward_std": 1.4063599109649658,
      "rewards/cargo_build_reward": 0.875,
      "rewards/cargo_clippy_reward": 0.875,
      "rewards/cargo_test_reward": 0.53125,
      "rewards/code_block_count_reward": 1.0,
      "rewards/non_empty_reward": 1.0,
      "rewards/test_block_count_reward": 1.0,
      "rewards/tests_have_asserts_reward": 0.734375,
      "step": 500
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 13435773,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}