{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20366598778004075, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1371.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 454.40625, "completions/mean_terminated_length": 454.40625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.0004073319755600815, "grad_norm": 0.1175210377671042, "kl": 9.036064147949219e-05, "learning_rate": 0.0, "loss": -0.0186, "num_tokens": 29437.0, "reward": 3.1875, "reward_std": 1.1310166120529175, "rewards/cargo_build_reward": 0.4375, "rewards/cargo_clippy_reward": 0.4375, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.6875, "rewards/tests_have_asserts_reward": 0.34375, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 371.28125, "completions/mean_terminated_length": 383.258064516129, "completions/min_length": 0.0, "completions/min_terminated_length": 99.0, "epoch": 0.000814663951120163, "grad_norm": 0.132212480221256, "kl": 8.082389831542969e-05, "learning_rate": 1e-07, "loss": -0.0175, "num_tokens": 60654.0, "reward": 3.484375, "reward_std": 1.3426176309585571, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.90625, "rewards/test_block_count_reward": 0.8125, "rewards/tests_have_asserts_reward": 0.53125, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 213.21875, "completions/mean_terminated_length": 213.21875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0012219959266802445, "grad_norm": 0.12912013288705376, "kl": 0.0001004934310913086, "learning_rate": 2e-07, "loss": -0.0111, "num_tokens": 82117.0, "reward": 4.421875, "reward_std": 1.8205678462982178, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.9375, "rewards/test_block_count_reward": 0.8125, "rewards/tests_have_asserts_reward": 0.578125, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 303.4375, "completions/mean_terminated_length": 303.4375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.001629327902240326, "grad_norm": 0.12375396685149824, "kl": 0.00011289119720458984, "learning_rate": 3e-07, "loss": -0.0151, "num_tokens": 106875.0, "reward": 4.046875, "reward_std": 1.3957082033157349, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.875, "rewards/test_block_count_reward": 0.71875, "rewards/tests_have_asserts_reward": 0.5, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 367.9375, "completions/mean_terminated_length": 367.9375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.002036659877800407, "grad_norm": 0.18338271433197806, "kl": 9.357929229736328e-05, "learning_rate": 4e-07, "loss": -0.0004, "num_tokens": 133297.0, "reward": 4.5859375, "reward_std": 1.7285411357879639, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.8046875, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 286.1875, "completions/mean_terminated_length": 286.1875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.002443991853360489, "grad_norm": 0.1225919146096492, "kl": 0.00015878677368164062, "learning_rate": 5e-07, "loss": -0.0025, "num_tokens": 156767.0, "reward": 3.5859375, "reward_std": 1.216860055923462, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.875, "rewards/test_block_count_reward": 0.75, "rewards/tests_have_asserts_reward": 0.3359375, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 220.15625, "completions/mean_terminated_length": 220.15625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0028513238289205704, "grad_norm": 0.1586147171757847, "kl": 0.00019216537475585938, "learning_rate": 6e-07, "loss": -0.0151, "num_tokens": 178516.0, "reward": 3.890625, "reward_std": 1.9172019958496094, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.875, "rewards/test_block_count_reward": 0.625, "rewards/tests_have_asserts_reward": 0.453125, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 473.34375, "completions/mean_terminated_length": 473.34375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.003258655804480652, "grad_norm": 0.0793017248205778, "kl": 9.167194366455078e-05, "learning_rate": 7e-07, "loss": -0.004, "num_tokens": 209255.0, "reward": 3.96875, "reward_std": 0.6764461398124695, "rewards/cargo_build_reward": 0.4375, "rewards/cargo_clippy_reward": 0.4375, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.875, "rewards/tests_have_asserts_reward": 0.71875, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 317.90625, "completions/mean_terminated_length": 317.90625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.0036659877800407333, "grad_norm": 0.1202053154821433, "kl": 0.00035762786865234375, "learning_rate": 8e-07, "loss": -0.0066, "num_tokens": 234740.0, "reward": 3.4296875, "reward_std": 1.3169221878051758, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.9375, "rewards/test_block_count_reward": 0.625, "rewards/tests_have_asserts_reward": 0.3046875, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 334.8125, "completions/mean_terminated_length": 334.8125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.004073319755600814, "grad_norm": 0.12952392522902093, "kl": 0.0002543926239013672, "learning_rate": 9e-07, "loss": -0.017, "num_tokens": 260822.0, "reward": 3.375, "reward_std": 1.4019556045532227, "rewards/cargo_build_reward": 0.4375, "rewards/cargo_clippy_reward": 0.4375, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.6875, "rewards/tests_have_asserts_reward": 0.40625, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 430.3125, "completions/mean_terminated_length": 430.3125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.004480651731160896, "grad_norm": 0.09429395732660069, "kl": 0.0001468658447265625, "learning_rate": 1e-06, "loss": -0.011, "num_tokens": 289576.0, "reward": 3.1796875, "reward_std": 0.8280896544456482, "rewards/cargo_build_reward": 0.375, "rewards/cargo_clippy_reward": 0.375, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.84375, "rewards/tests_have_asserts_reward": 0.5390625, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 261.15625, "completions/mean_terminated_length": 261.15625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.004887983706720978, "grad_norm": 0.11577175478294834, "kl": 0.0003638267517089844, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 312557.0, "reward": 4.9453125, "reward_std": 1.4855623245239258, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.7109375, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 269.65625, "completions/mean_terminated_length": 269.65625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.005295315682281059, "grad_norm": 0.14930968488555413, "kl": 0.00039386749267578125, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 336546.0, "reward": 4.2109375, "reward_std": 1.5795735120773315, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.75, "rewards/tests_have_asserts_reward": 0.3671875, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 319.6875, "completions/mean_terminated_length": 319.6875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.005702647657841141, "grad_norm": 0.1382750920092372, "kl": 0.00045013427734375, "learning_rate": 1e-06, "loss": -0.0119, "num_tokens": 361824.0, "reward": 3.90625, "reward_std": 1.2457867860794067, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.9375, "rewards/test_block_count_reward": 0.84375, "rewards/tests_have_asserts_reward": 0.609375, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 315.90625, "completions/mean_terminated_length": 315.90625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.006109979633401222, "grad_norm": 0.10016667125488203, "kl": 0.0008392333984375, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 387285.0, "reward": 3.7734375, "reward_std": 0.9530363082885742, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.75, "rewards/tests_have_asserts_reward": 0.3984375, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 285.28125, "completions/mean_terminated_length": 285.28125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.006517311608961304, "grad_norm": 0.13715469155008012, "kl": 0.0009860992431640625, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 411862.0, "reward": 4.03125, "reward_std": 1.5449090003967285, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.84375, "rewards/tests_have_asserts_reward": 0.40625, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 267.75, "completions/mean_terminated_length": 267.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.006924643584521385, "grad_norm": 0.0866997914412378, "kl": 0.0011415481567382812, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 436118.0, "reward": 3.5703125, "reward_std": 1.0962092876434326, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.5, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.84375, "rewards/test_block_count_reward": 0.8125, "rewards/tests_have_asserts_reward": 0.6328125, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.007331975560081467, "grad_norm": 0.10514629633326943, "kl": 0.001956939697265625, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 458362.0, "reward": 4.6796875, "reward_std": 1.4177031517028809, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.875, "rewards/tests_have_asserts_reward": 0.6484375, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 398.3125, "completions/mean_terminated_length": 398.3125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.007739307535641548, "grad_norm": 0.11106316659149937, "kl": 0.001277923583984375, "learning_rate": 1e-06, "loss": -0.0067, "num_tokens": 485228.0, "reward": 3.671875, "reward_std": 0.9971106052398682, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.5, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.90625, "rewards/tests_have_asserts_reward": 0.71875, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 363.21875, "completions/mean_terminated_length": 363.21875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.008146639511201629, "grad_norm": 0.09132485848606724, "kl": 0.0019779205322265625, "learning_rate": 1e-06, "loss": -0.0124, "num_tokens": 511587.0, "reward": 4.0234375, "reward_std": 0.9875117540359497, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.5, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.6484375, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 350.71875, "completions/mean_terminated_length": 362.03225806451616, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.008553971486761711, "grad_norm": 0.10923375708807619, "kl": 0.002239227294921875, "learning_rate": 1e-06, "loss": -0.0095, "num_tokens": 541231.0, "reward": 3.4765625, "reward_std": 0.9791522026062012, "rewards/cargo_build_reward": 0.375, "rewards/cargo_clippy_reward": 0.375, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.8125, "rewards/tests_have_asserts_reward": 0.3828125, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 339.5, "completions/mean_terminated_length": 339.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.008961303462321792, "grad_norm": 0.11906376751591456, "kl": 0.002227783203125, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 566959.0, "reward": 5.0703125, "reward_std": 1.1388654708862305, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 373.84375, "completions/mean_terminated_length": 373.84375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.009368635437881873, "grad_norm": 0.11868349885586216, "kl": 0.0027923583984375, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 593706.0, "reward": 3.890625, "reward_std": 1.3067610263824463, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.9375, "rewards/test_block_count_reward": 0.84375, "rewards/tests_have_asserts_reward": 0.46875, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 379.25, "completions/mean_terminated_length": 404.53333333333336, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 0.009775967413441956, "grad_norm": 0.14299534105777312, "kl": 0.00243377685546875, "learning_rate": 1e-06, "loss": -0.0318, "num_tokens": 628716.0, "reward": 3.53125, "reward_std": 1.060036540031433, "rewards/cargo_build_reward": 0.4375, "rewards/cargo_clippy_reward": 0.4375, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 0.9375, "rewards/non_empty_reward": 0.9375, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.59375, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 281.03125, "completions/mean_terminated_length": 281.03125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.010183299389002037, "grad_norm": 0.06393125755602662, "kl": 0.0063934326171875, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 652525.0, "reward": 3.21875, "reward_std": 0.6925716400146484, "rewards/cargo_build_reward": 0.25, "rewards/cargo_clippy_reward": 0.25, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.90625, "rewards/tests_have_asserts_reward": 0.640625, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.010590631364562118, "grad_norm": 0.19607205581028808, "kl": 0.0119476318359375, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 677125.0, "reward": 3.9921875, "reward_std": 2.011322259902954, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.84375, "rewards/test_block_count_reward": 0.59375, "rewards/tests_have_asserts_reward": 0.1640625, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 276.71875, "completions/mean_terminated_length": 276.71875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.010997963340122199, "grad_norm": 0.12244939393928782, "kl": 0.0082244873046875, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 701276.0, "reward": 4.4453125, "reward_std": 1.0721098184585571, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.5703125, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 340.6875, "completions/mean_terminated_length": 340.6875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.011405295315682282, "grad_norm": 0.11316959025254554, "kl": 0.0091094970703125, "learning_rate": 1e-06, "loss": -0.0098, "num_tokens": 727458.0, "reward": 4.03125, "reward_std": 1.1808925867080688, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.5, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.8125, "rewards/tests_have_asserts_reward": 0.5, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 351.28125, "completions/mean_terminated_length": 351.28125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.011812627291242363, "grad_norm": 0.11011504762441725, "kl": 0.01514434814453125, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 753907.0, "reward": 4.3046875, "reward_std": 1.0233951807022095, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.84375, "rewards/tests_have_asserts_reward": 0.7421875, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 339.5625, "completions/mean_terminated_length": 339.5625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.012219959266802444, "grad_norm": 0.0720316910153896, "kl": 0.01592254638671875, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 779765.0, "reward": 3.6484375, "reward_std": 0.7949134111404419, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.9375, "rewards/test_block_count_reward": 0.90625, "rewards/tests_have_asserts_reward": 0.6953125, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 313.59375, "completions/mean_terminated_length": 313.59375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.012627291242362525, "grad_norm": 0.1388954396991544, "kl": 0.0106353759765625, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 804544.0, "reward": 4.3671875, "reward_std": 1.3792965412139893, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.5234375, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2822.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 419.40625, "completions/mean_terminated_length": 341.9032258064516, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.013034623217922607, "grad_norm": 0.1324050390267588, "kl": 0.00836944580078125, "learning_rate": 1e-06, "loss": 0.0489, "num_tokens": 832677.0, "reward": 3.71875, "reward_std": 1.5190155506134033, "rewards/cargo_build_reward": 0.40625, "rewards/cargo_clippy_reward": 0.40625, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.671875, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 307.90625, "completions/mean_terminated_length": 307.90625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.013441955193482688, "grad_norm": 0.08163626831475472, "kl": 0.0209808349609375, "learning_rate": 1e-06, "loss": -0.0067, "num_tokens": 857042.0, "reward": 4.515625, "reward_std": 1.0991976261138916, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.5, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.765625, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 392.40625, "completions/mean_terminated_length": 392.40625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.01384928716904277, "grad_norm": 0.12031590107086831, "kl": 0.01251220703125, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 884423.0, "reward": 3.59375, "reward_std": 1.2008159160614014, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.5, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.90625, "rewards/tests_have_asserts_reward": 0.390625, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 306.3125, "completions/mean_terminated_length": 306.3125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.014256619144602852, "grad_norm": 0.1399342572201649, "kl": 0.0334320068359375, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 909393.0, "reward": 4.5625, "reward_std": 1.0657768249511719, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.90625, "rewards/tests_have_asserts_reward": 0.453125, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 463.9375, "completions/mean_terminated_length": 463.9375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.014663951120162933, "grad_norm": 0.1078256076237383, "kl": 0.0101165771484375, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 939071.0, "reward": 3.4140625, "reward_std": 0.945007860660553, "rewards/cargo_build_reward": 0.34375, "rewards/cargo_clippy_reward": 0.34375, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.7578125, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 342.4375, "completions/mean_terminated_length": 342.4375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.015071283095723014, "grad_norm": 0.11291825337657914, "kl": 0.0157623291015625, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 964677.0, "reward": 4.2578125, "reward_std": 1.248620629310608, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8359375, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 531.9375, "completions/mean_terminated_length": 531.9375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.015478615071283095, "grad_norm": 0.09047481558569173, "kl": 0.0214385986328125, "learning_rate": 1e-06, "loss": -0.0087, "num_tokens": 997251.0, "reward": 3.078125, "reward_std": 0.8194274306297302, "rewards/cargo_build_reward": 0.25, "rewards/cargo_clippy_reward": 0.25, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.609375, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 335.34375, "completions/mean_terminated_length": 346.16129032258067, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.015885947046843176, "grad_norm": 0.12381201903810035, "kl": 0.0168304443359375, "learning_rate": 1e-06, "loss": -0.0139, "num_tokens": 1026833.0, "reward": 3.8515625, "reward_std": 1.401503086090088, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.5, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.6796875, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 352.8125, "completions/mean_terminated_length": 352.8125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.016293279022403257, "grad_norm": 0.13170498818621618, "kl": 0.0164947509765625, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 1053755.0, "reward": 4.6640625, "reward_std": 1.2833765745162964, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6640625, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 323.5625, "completions/mean_terminated_length": 334.0, "completions/min_length": 0.0, "completions/min_terminated_length": 111.0, "epoch": 0.01670061099796334, "grad_norm": 0.10743206140354492, "kl": 0.021087646484375, "learning_rate": 1e-06, "loss": -0.0139, "num_tokens": 1083248.0, "reward": 3.3828125, "reward_std": 1.0023860931396484, "rewards/cargo_build_reward": 0.4375, "rewards/cargo_clippy_reward": 0.4375, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.5859375, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 413.8125, "completions/mean_terminated_length": 413.8125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.017107942973523423, "grad_norm": 0.10643379964820185, "kl": 0.015838623046875, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 1111402.0, "reward": 4.0625, "reward_std": 1.0337579250335693, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.5, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 358.9375, "completions/mean_terminated_length": 358.9375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.017515274949083504, "grad_norm": 0.11035849866551167, "kl": 0.18438720703125, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 1137288.0, "reward": 3.390625, "reward_std": 0.7923446893692017, "rewards/cargo_build_reward": 0.40625, "rewards/cargo_clippy_reward": 0.40625, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.90625, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.640625, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 349.5, "completions/mean_terminated_length": 349.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.017922606924643585, "grad_norm": 0.13138570856112985, "kl": 0.02716064453125, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 1163224.0, "reward": 5.046875, "reward_std": 1.3160395622253418, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.5625, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 451.375, "completions/mean_terminated_length": 451.375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.018329938900203666, "grad_norm": 0.12717083019014958, "kl": 0.014984130859375, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 1192716.0, "reward": 3.5625, "reward_std": 1.2201060056686401, "rewards/cargo_build_reward": 0.40625, "rewards/cargo_clippy_reward": 0.40625, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.625, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 375.59375, "completions/mean_terminated_length": 375.59375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.018737270875763747, "grad_norm": 0.11688640840387661, "kl": 0.085601806640625, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 1219279.0, "reward": 3.8203125, "reward_std": 1.2091054916381836, "rewards/cargo_build_reward": 0.4375, "rewards/cargo_clippy_reward": 0.4375, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.6796875, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 363.3125, "completions/mean_terminated_length": 363.3125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.019144602851323828, "grad_norm": 0.1207956799753875, "kl": 0.024078369140625, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 1246617.0, "reward": 3.953125, "reward_std": 1.2309598922729492, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.703125, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 325.125, "completions/mean_terminated_length": 325.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.019551934826883912, "grad_norm": 0.10192110556876445, "kl": 0.027984619140625, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 1272621.0, "reward": 4.96875, "reward_std": 1.1239038705825806, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.65625, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 368.28125, "completions/mean_terminated_length": 368.28125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.019959266802443993, "grad_norm": 0.09942170746232587, "kl": 0.06756591796875, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 1299686.0, "reward": 4.1171875, "reward_std": 1.0894885063171387, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.6640625, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.020366598778004074, "grad_norm": 0.07022626825674544, "kl": 0.034515380859375, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 1323846.0, "reward": 4.515625, "reward_std": 0.8746212124824524, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.765625, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1590.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 393.28125, "completions/mean_terminated_length": 405.96774193548384, "completions/min_length": 0.0, "completions/min_terminated_length": 139.0, "epoch": 0.020773930753564155, "grad_norm": 0.17307638223220873, "kl": 0.02642822265625, "learning_rate": 1e-06, "loss": -0.0037, "num_tokens": 1355344.0, "reward": 4.53125, "reward_std": 0.9753614068031311, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 0.9375, "rewards/non_empty_reward": 0.90625, "rewards/test_block_count_reward": 0.875, "rewards/tests_have_asserts_reward": 0.53125, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 276.875, "completions/mean_terminated_length": 285.80645161290323, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.021181262729124236, "grad_norm": 0.1257849939448993, "kl": 0.03900146484375, "learning_rate": 1e-06, "loss": -0.0143, "num_tokens": 1382811.0, "reward": 3.8359375, "reward_std": 1.3329918384552002, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.6640625, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 391.5625, "completions/mean_terminated_length": 391.5625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.021588594704684317, "grad_norm": 0.11213828920245573, "kl": 0.1412353515625, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 1409941.0, "reward": 4.2578125, "reward_std": 0.9867215156555176, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8203125, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 325.96875, "completions/mean_terminated_length": 325.96875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.021995926680244398, "grad_norm": 0.07234987825205394, "kl": 0.03948974609375, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 1435180.0, "reward": 4.9609375, "reward_std": 0.8981945514678955, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 279.0625, "completions/mean_terminated_length": 279.0625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.02240325865580448, "grad_norm": 0.14348224459256667, "kl": 0.193603515625, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 1459278.0, "reward": 4.4140625, "reward_std": 1.7242329120635986, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.5546875, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 312.8125, "completions/mean_terminated_length": 312.8125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.022810590631364563, "grad_norm": 0.1167305580985666, "kl": 0.04241943359375, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 1484160.0, "reward": 4.984375, "reward_std": 1.2556254863739014, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.625, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 363.09375, "completions/mean_terminated_length": 363.09375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.023217922606924644, "grad_norm": 0.09973532180599194, "kl": 0.038330078125, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 1511323.0, "reward": 3.6171875, "reward_std": 1.0292447805404663, "rewards/cargo_build_reward": 0.4375, "rewards/cargo_clippy_reward": 0.4375, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.4921875, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 393.6875, "completions/mean_terminated_length": 393.6875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.023625254582484725, "grad_norm": 0.1346776199533448, "kl": 0.03607177734375, "learning_rate": 1e-06, "loss": -0.0115, "num_tokens": 1539721.0, "reward": 4.3359375, "reward_std": 1.3484569787979126, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8359375, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 331.34375, "completions/mean_terminated_length": 331.34375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.024032586558044806, "grad_norm": 0.13504668058649946, "kl": 0.0440673828125, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 1565284.0, "reward": 4.7578125, "reward_std": 1.4678330421447754, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 401.125, "completions/mean_terminated_length": 401.125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.024439918533604887, "grad_norm": 0.1234455225844031, "kl": 0.037353515625, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 1593368.0, "reward": 4.0078125, "reward_std": 1.077244520187378, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7578125, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 328.4375, "completions/mean_terminated_length": 328.4375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.02484725050916497, "grad_norm": 0.08762858795609393, "kl": 0.04791259765625, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 1619062.0, "reward": 4.90625, "reward_std": 0.9854581952095032, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.78125, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 309.78125, "completions/mean_terminated_length": 309.78125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.02525458248472505, "grad_norm": 0.09003120864656787, "kl": 0.052978515625, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 1643903.0, "reward": 5.5546875, "reward_std": 1.0079970359802246, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.5625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 370.09375, "completions/mean_terminated_length": 370.09375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.025661914460285134, "grad_norm": 0.10777385978776224, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 1671162.0, "reward": 4.6796875, "reward_std": 1.262016773223877, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6796875, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.026069246435845215, "grad_norm": 0.11339301063035911, "kl": 0.05950927734375, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 1694594.0, "reward": 4.90625, "reward_std": 1.3360557556152344, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.609375, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 496.65625, "completions/mean_terminated_length": 496.65625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.026476578411405296, "grad_norm": 0.12964564093106987, "kl": 0.0882568359375, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 1725039.0, "reward": 3.1875, "reward_std": 0.9519739151000977, "rewards/cargo_build_reward": 0.1875, "rewards/cargo_clippy_reward": 0.1875, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6875, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3610.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 467.78125, "completions/mean_terminated_length": 366.4193548387097, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.026883910386965377, "grad_norm": 0.10976050477139662, "kl": 0.037506103515625, "learning_rate": 1e-06, "loss": 0.0787, "num_tokens": 1755272.0, "reward": 4.0703125, "reward_std": 1.0649116039276123, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8828125, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 356.53125, "completions/mean_terminated_length": 356.53125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.027291242362525458, "grad_norm": 0.14099648252527722, "kl": 0.04632568359375, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 1782521.0, "reward": 4.2578125, "reward_std": 1.2450398206710815, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 289.75, "completions/mean_terminated_length": 299.0967741935484, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.02769857433808554, "grad_norm": 0.13464657563187157, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 1810108.0, "reward": 4.9140625, "reward_std": 1.5103744268417358, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.9140625, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 453.84375, "completions/mean_terminated_length": 453.84375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.02810590631364562, "grad_norm": 0.14876203658898152, "kl": 0.0380859375, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 1839087.0, "reward": 3.8828125, "reward_std": 1.2079672813415527, "rewards/cargo_build_reward": 0.4375, "rewards/cargo_clippy_reward": 0.4375, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8671875, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 293.0625, "completions/mean_terminated_length": 302.51612903225805, "completions/min_length": 0.0, "completions/min_terminated_length": 143.0, "epoch": 0.028513238289205704, "grad_norm": 0.09396813349655643, "kl": 0.3768310546875, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 1866810.0, "reward": 3.96875, "reward_std": 0.795851469039917, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.734375, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 406.5, "completions/mean_terminated_length": 419.61290322580646, "completions/min_length": 0.0, "completions/min_terminated_length": 215.0, "epoch": 0.028920570264765785, "grad_norm": 0.12598281125371927, "kl": 0.03662109375, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 1898261.0, "reward": 3.703125, "reward_std": 1.0857936143875122, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.578125, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 278.8125, "completions/mean_terminated_length": 278.8125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.029327902240325866, "grad_norm": 0.10930021044923645, "kl": 0.06024169921875, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 1921943.0, "reward": 4.8984375, "reward_std": 1.2769497632980347, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 435.65625, "completions/mean_terminated_length": 435.65625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.029735234215885947, "grad_norm": 0.07778383657694855, "kl": 0.0411376953125, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 1951292.0, "reward": 4.1953125, "reward_std": 0.9303152561187744, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.5859375, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 345.0625, "completions/mean_terminated_length": 345.0625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.03014256619144603, "grad_norm": 0.11792150853440085, "kl": 0.08026123046875, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 1976758.0, "reward": 4.4453125, "reward_std": 0.9902048110961914, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8046875, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 372.6875, "completions/mean_terminated_length": 372.6875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.03054989816700611, "grad_norm": 0.12309215125848344, "kl": 0.05194091796875, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 2004436.0, "reward": 4.4609375, "reward_std": 1.3562747240066528, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8671875, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 385.59375, "completions/mean_terminated_length": 385.59375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.03095723014256619, "grad_norm": 0.1384324357143004, "kl": 0.04718017578125, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 2031031.0, "reward": 4.34375, "reward_std": 0.9096367955207825, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.5625, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 277.6875, "completions/mean_terminated_length": 277.6875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.03136456211812627, "grad_norm": 0.07950834144105107, "kl": 0.06903076171875, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 2054549.0, "reward": 5.2890625, "reward_std": 0.8124831914901733, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7890625, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 406.21875, "completions/mean_terminated_length": 406.21875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.03177189409368635, "grad_norm": 0.133902651952502, "kl": 0.0469970703125, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 2081900.0, "reward": 3.5234375, "reward_std": 1.0582976341247559, "rewards/cargo_build_reward": 0.28125, "rewards/cargo_clippy_reward": 0.28125, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8359375, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 453.71875, "completions/mean_terminated_length": 453.71875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.03217922606924643, "grad_norm": 0.08836244866516904, "kl": 0.04144287109375, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 2111187.0, "reward": 3.5625, "reward_std": 0.8975033760070801, "rewards/cargo_build_reward": 0.375, "rewards/cargo_clippy_reward": 0.375, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.609375, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 400.09375, "completions/mean_terminated_length": 400.09375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.032586558044806514, "grad_norm": 0.15207937990490714, "kl": 0.18634033203125, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 2138638.0, "reward": 3.9765625, "reward_std": 1.2686288356781006, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7265625, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 300.40625, "completions/mean_terminated_length": 300.40625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.0329938900203666, "grad_norm": 0.10830167291849532, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": -0.0046, "num_tokens": 2162715.0, "reward": 5.515625, "reward_std": 0.6978596448898315, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.890625, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 291.90625, "completions/mean_terminated_length": 291.90625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.03340122199592668, "grad_norm": 0.11140370069913581, "kl": 0.40771484375, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 2187448.0, "reward": 4.1875, "reward_std": 1.369483232498169, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.703125, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 337.53125, "completions/mean_terminated_length": 337.53125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.033808553971486764, "grad_norm": 0.13283768108323413, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 2213129.0, "reward": 4.265625, "reward_std": 1.1658533811569214, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.515625, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 435.125, "completions/mean_terminated_length": 449.16129032258067, "completions/min_length": 0.0, "completions/min_terminated_length": 286.0, "epoch": 0.034215885947046845, "grad_norm": 0.12189998293491974, "kl": 0.12158203125, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 2245827.0, "reward": 3.5625, "reward_std": 1.343709111213684, "rewards/cargo_build_reward": 0.375, "rewards/cargo_clippy_reward": 0.34375, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.859375, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 320.21875, "completions/mean_terminated_length": 320.21875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.034623217922606926, "grad_norm": 0.10492427493641471, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 2270802.0, "reward": 5.015625, "reward_std": 1.1473376750946045, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.765625, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 331.34375, "completions/mean_terminated_length": 331.34375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.03503054989816701, "grad_norm": 0.11451060016867645, "kl": 0.06793212890625, "learning_rate": 1e-06, "loss": -0.0082, "num_tokens": 2295861.0, "reward": 4.5390625, "reward_std": 1.3130030632019043, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.7421875, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 389.65625, "completions/mean_terminated_length": 389.65625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.03543788187372709, "grad_norm": 0.06681902057762526, "kl": 0.05255126953125, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 2324050.0, "reward": 4.015625, "reward_std": 0.4756559729576111, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.515625, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 330.5, "completions/mean_terminated_length": 330.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.03584521384928717, "grad_norm": 0.12738046714303833, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 2349850.0, "reward": 4.4375, "reward_std": 1.194375991821289, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.625, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 370.9375, "completions/mean_terminated_length": 370.9375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.03625254582484725, "grad_norm": 0.07802640302364765, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 2376280.0, "reward": 5.1015625, "reward_std": 0.8754923343658447, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6640625, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 389.5, "completions/mean_terminated_length": 389.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.03665987780040733, "grad_norm": 0.09057003503703503, "kl": 0.05352783203125, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 2403880.0, "reward": 4.4296875, "reward_std": 0.9106569886207581, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8046875, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 291.4375, "completions/mean_terminated_length": 291.4375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.03706720977596741, "grad_norm": 0.13704527818486306, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 2428718.0, "reward": 4.7890625, "reward_std": 1.4855718612670898, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.6171875, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 256.5625, "completions/mean_terminated_length": 256.5625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.03747454175152749, "grad_norm": 0.08634017282421738, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 2451736.0, "reward": 5.5234375, "reward_std": 1.2359544038772583, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8984375, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 329.53125, "completions/mean_terminated_length": 329.53125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.037881873727087574, "grad_norm": 0.10753217283790532, "kl": 0.06671142578125, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 2476761.0, "reward": 4.34375, "reward_std": 1.0839296579360962, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.78125, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 372.09375, "completions/mean_terminated_length": 372.09375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.038289205702647655, "grad_norm": 0.10051808748142209, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 2503772.0, "reward": 4.4765625, "reward_std": 0.886406421661377, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7890625, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 536.03125, "completions/mean_terminated_length": 536.03125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.038696537678207736, "grad_norm": 0.14038449079335694, "kl": 0.0394287109375, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 2535957.0, "reward": 3.6328125, "reward_std": 1.128450632095337, "rewards/cargo_build_reward": 0.40625, "rewards/cargo_clippy_reward": 0.40625, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.6484375, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1085.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 388.4375, "completions/mean_terminated_length": 388.4375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.039103869653767824, "grad_norm": 0.09311033485193937, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 2562459.0, "reward": 3.703125, "reward_std": 0.8902084827423096, "rewards/cargo_build_reward": 0.40625, "rewards/cargo_clippy_reward": 0.40625, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.890625, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 422.40625, "completions/mean_terminated_length": 422.40625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.039511201629327905, "grad_norm": 0.09004957679418751, "kl": 0.0880126953125, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 2591568.0, "reward": 3.609375, "reward_std": 0.8331196904182434, "rewards/cargo_build_reward": 0.4375, "rewards/cargo_clippy_reward": 0.4375, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.4375, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 362.4375, "completions/mean_terminated_length": 362.4375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.039918533604887986, "grad_norm": 0.06577931737389649, "kl": 0.59649658203125, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 2618342.0, "reward": 4.0, "reward_std": 0.5188412666320801, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.578125, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 356.78125, "completions/mean_terminated_length": 356.78125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.04032586558044807, "grad_norm": 0.054377706261197915, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 2645143.0, "reward": 3.5546875, "reward_std": 0.5849505662918091, "rewards/cargo_build_reward": 0.375, "rewards/cargo_clippy_reward": 0.375, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.4921875, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 354.625, "completions/mean_terminated_length": 354.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.04073319755600815, "grad_norm": 0.139261212805125, "kl": 0.05682373046875, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 2671251.0, "reward": 3.8125, "reward_std": 1.3117740154266357, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.5, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.5, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 409.3125, "completions/mean_terminated_length": 409.3125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.04114052953156823, "grad_norm": 0.09072193629484701, "kl": 0.0150299072265625, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 2699653.0, "reward": 3.9375, "reward_std": 0.8187613487243652, "rewards/cargo_build_reward": 0.40625, "rewards/cargo_clippy_reward": 0.40625, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.703125, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 380.53125, "completions/mean_terminated_length": 380.53125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.04154786150712831, "grad_norm": 0.08558946344667806, "kl": 0.016571044921875, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 2727030.0, "reward": 4.828125, "reward_std": 0.8606547117233276, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.890625, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 304.53125, "completions/mean_terminated_length": 304.53125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.04195519348268839, "grad_norm": 0.08305197700331375, "kl": 0.020263671875, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 2752087.0, "reward": 5.3984375, "reward_std": 0.8094767928123474, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 413.5, "completions/mean_terminated_length": 413.5, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.04236252545824847, "grad_norm": 0.11857185466291319, "kl": 0.0153045654296875, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 2780375.0, "reward": 4.1953125, "reward_std": 1.06247878074646, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 517.5, "completions/mean_terminated_length": 517.5, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.04276985743380855, "grad_norm": 0.13490652078592621, "kl": 0.011444091796875, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 2811935.0, "reward": 3.5703125, "reward_std": 1.3479619026184082, "rewards/cargo_build_reward": 0.34375, "rewards/cargo_clippy_reward": 0.34375, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6328125, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 320.78125, "completions/mean_terminated_length": 320.78125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.043177189409368634, "grad_norm": 0.0943869231894059, "kl": 0.01837158203125, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 2837240.0, "reward": 3.703125, "reward_std": 0.8034542798995972, "rewards/cargo_build_reward": 0.40625, "rewards/cargo_clippy_reward": 0.375, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 314.59375, "completions/mean_terminated_length": 314.59375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.043584521384928715, "grad_norm": 0.08439674479274882, "kl": 0.0181884765625, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 2862835.0, "reward": 4.9765625, "reward_std": 0.824439287185669, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7265625, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 399.0, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.043991853360488796, "grad_norm": 0.10557785012511325, "kl": 0.0165557861328125, "learning_rate": 1e-06, "loss": -0.0059, "num_tokens": 2891315.0, "reward": 4.3046875, "reward_std": 1.1515986919403076, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9296875, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 260.4375, "completions/mean_terminated_length": 260.4375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.04439918533604888, "grad_norm": 0.10739264731467306, "kl": 0.155548095703125, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 2915001.0, "reward": 5.0625, "reward_std": 1.2184569835662842, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.703125, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1135.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 423.625, "completions/mean_terminated_length": 423.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.04480651731160896, "grad_norm": 0.11203578083151883, "kl": 0.015380859375, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 2943325.0, "reward": 3.921875, "reward_std": 1.066467523574829, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.5, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.6875, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 472.90625, "completions/mean_terminated_length": 472.90625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.045213849287169046, "grad_norm": 0.11142257826861593, "kl": 0.0133209228515625, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 2973514.0, "reward": 3.6171875, "reward_std": 0.9596100449562073, "rewards/cargo_build_reward": 0.34375, "rewards/cargo_clippy_reward": 0.34375, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9296875, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 334.59375, "completions/mean_terminated_length": 334.59375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.04562118126272913, "grad_norm": 0.12774013251987382, "kl": 0.01898193359375, "learning_rate": 1e-06, "loss": -0.0067, "num_tokens": 2999165.0, "reward": 3.9921875, "reward_std": 1.0958621501922607, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6796875, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2746.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 454.71875, "completions/mean_terminated_length": 454.71875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.04602851323828921, "grad_norm": 0.11556314604735626, "kl": 0.016754150390625, "learning_rate": 1e-06, "loss": -0.0358, "num_tokens": 3029132.0, "reward": 4.8828125, "reward_std": 0.7093173265457153, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6796875, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.04643584521384929, "grad_norm": 0.12511619983156722, "kl": 0.01885986328125, "learning_rate": 1e-06, "loss": 0.0068, "num_tokens": 3054100.0, "reward": 3.9375, "reward_std": 1.0332987308502197, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.75, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2299.0, "completions/max_terminated_length": 2299.0, "completions/mean_length": 482.34375, "completions/mean_terminated_length": 482.34375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.04684317718940937, "grad_norm": 0.15488648519429415, "kl": 0.014068603515625, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 3084839.0, "reward": 3.8828125, "reward_std": 1.3750832080841064, "rewards/cargo_build_reward": 0.40625, "rewards/cargo_clippy_reward": 0.40625, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7578125, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.04725050916496945, "grad_norm": 0.11958016309066218, "kl": 0.0204925537109375, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 3110687.0, "reward": 4.6875, "reward_std": 1.2157671451568604, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8125, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 477.84375, "completions/mean_terminated_length": 477.84375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.04765784114052953, "grad_norm": 0.13733313268090042, "kl": 0.01507568359375, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 3141714.0, "reward": 3.4921875, "reward_std": 1.2556400299072266, "rewards/cargo_build_reward": 0.34375, "rewards/cargo_clippy_reward": 0.34375, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6171875, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 414.28125, "completions/mean_terminated_length": 414.28125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.04806517311608961, "grad_norm": 0.11384370016057642, "kl": 0.01507568359375, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 3170355.0, "reward": 3.96875, "reward_std": 0.9767351150512695, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.78125, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 499.59375, "completions/mean_terminated_length": 499.59375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.048472505091649694, "grad_norm": 0.192732149328862, "kl": 0.035491943359375, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 3201262.0, "reward": 3.515625, "reward_std": 1.357240915298462, "rewards/cargo_build_reward": 0.375, "rewards/cargo_clippy_reward": 0.375, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.515625, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1158.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 353.4375, "completions/mean_terminated_length": 353.4375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.048879837067209775, "grad_norm": 0.090119599820571, "kl": 0.019805908203125, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 3228220.0, "reward": 4.90625, "reward_std": 1.0091047286987305, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.71875, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 382.375, "completions/mean_terminated_length": 382.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.049287169042769856, "grad_norm": 0.12271168318363614, "kl": 0.023101806640625, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 3255104.0, "reward": 4.34375, "reward_std": 1.167395830154419, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.921875, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1436.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 459.75, "completions/mean_terminated_length": 474.5806451612903, "completions/min_length": 0.0, "completions/min_terminated_length": 252.0, "epoch": 0.04969450101832994, "grad_norm": 0.09261390577655294, "kl": 0.0178375244140625, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 3287877.0, "reward": 3.421875, "reward_std": 0.9010084867477417, "rewards/cargo_build_reward": 0.3125, "rewards/cargo_clippy_reward": 0.3125, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.875, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 448.9375, "completions/mean_terminated_length": 448.9375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.05010183299389002, "grad_norm": 0.11138776361064398, "kl": 0.018585205078125, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 3317739.0, "reward": 3.8828125, "reward_std": 1.1341464519500732, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.5078125, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 277.53125, "completions/mean_terminated_length": 286.48387096774195, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.0505091649694501, "grad_norm": 0.17014362985083628, "kl": 1.129119873046875, "learning_rate": 1e-06, "loss": -0.016, "num_tokens": 3345155.0, "reward": 4.921875, "reward_std": 1.6797964572906494, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.9375, "rewards/test_block_count_reward": 0.875, "rewards/tests_have_asserts_reward": 0.75, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 545.5625, "completions/mean_terminated_length": 545.5625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.05091649694501019, "grad_norm": 0.12622940865825613, "kl": 0.0236358642578125, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 3378373.0, "reward": 4.0859375, "reward_std": 1.0542640686035156, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 278.90625, "completions/mean_terminated_length": 278.90625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.05132382892057027, "grad_norm": 0.08852836024214183, "kl": 0.04669189453125, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 3401898.0, "reward": 5.5859375, "reward_std": 1.0553646087646484, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8359375, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 257.625, "completions/mean_terminated_length": 257.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.05173116089613035, "grad_norm": 0.09176336366373557, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 3425278.0, "reward": 5.8359375, "reward_std": 0.834545373916626, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.59375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7109375, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1231.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 451.875, "completions/mean_terminated_length": 451.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.05213849287169043, "grad_norm": 0.12774418089567324, "kl": 0.02215576171875, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 3455458.0, "reward": 4.796875, "reward_std": 1.1469030380249023, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 378.75, "completions/mean_terminated_length": 378.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.05254582484725051, "grad_norm": 0.0980242741140148, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 3482530.0, "reward": 5.0390625, "reward_std": 0.8885169625282288, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7265625, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 373.03125, "completions/mean_terminated_length": 373.03125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.05295315682281059, "grad_norm": 0.12695718263684108, "kl": 0.030242919921875, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 3509611.0, "reward": 4.484375, "reward_std": 1.4467928409576416, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.796875, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 353.21875, "completions/mean_terminated_length": 353.21875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.05336048879837067, "grad_norm": 0.12938313975955237, "kl": 0.029388427734375, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 3536002.0, "reward": 4.4296875, "reward_std": 1.170635461807251, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8046875, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 350.0, "completions/mean_terminated_length": 350.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.053767820773930754, "grad_norm": 0.12555362474992476, "kl": 0.203582763671875, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 3563258.0, "reward": 5.046875, "reward_std": 1.3916959762573242, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.375, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 352.90625, "completions/mean_terminated_length": 352.90625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.054175152749490835, "grad_norm": 0.12782522321914772, "kl": 0.0283203125, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 3589367.0, "reward": 4.6796875, "reward_std": 1.2642593383789062, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8671875, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 360.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.054582484725050916, "grad_norm": 0.10633934376190354, "kl": 0.201812744140625, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 3616071.0, "reward": 4.3046875, "reward_std": 0.8920255303382874, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 255.8125, "completions/mean_terminated_length": 255.8125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.054989816700611, "grad_norm": 0.09032036141210152, "kl": 0.068634033203125, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 3639481.0, "reward": 5.3046875, "reward_std": 0.9606647491455078, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.7578125, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 560.0625, "completions/mean_terminated_length": 560.0625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.05539714867617108, "grad_norm": 0.11312472825908094, "kl": 0.026641845703125, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 3672323.0, "reward": 3.8359375, "reward_std": 1.0763511657714844, "rewards/cargo_build_reward": 0.34375, "rewards/cargo_clippy_reward": 0.34375, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.7890625, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 328.46875, "completions/mean_terminated_length": 328.46875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.05580448065173116, "grad_norm": 0.1488737904564473, "kl": 1.011474609375, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 3697154.0, "reward": 5.15625, "reward_std": 1.3229000568389893, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.78125, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 465.53125, "completions/mean_terminated_length": 465.53125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.05621181262729124, "grad_norm": 0.11183649813224111, "kl": 0.04339599609375, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 3727339.0, "reward": 4.5078125, "reward_std": 0.8816056251525879, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9453125, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 449.1875, "completions/mean_terminated_length": 495.6551724137931, "completions/min_length": 0.0, "completions/min_terminated_length": 198.0, "epoch": 0.05661914460285132, "grad_norm": 0.2009998859785612, "kl": 0.022979736328125, "learning_rate": 1e-06, "loss": -0.0419, "num_tokens": 3767171.0, "reward": 3.6171875, "reward_std": 1.3957762718200684, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 0.90625, "rewards/non_empty_reward": 0.90625, "rewards/test_block_count_reward": 0.90625, "rewards/tests_have_asserts_reward": 0.6796875, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 368.53125, "completions/mean_terminated_length": 380.4193548387097, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.05702647657841141, "grad_norm": 0.1257378479708751, "kl": 0.02899169921875, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 3797809.0, "reward": 5.1640625, "reward_std": 1.1951626539230347, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8515625, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 429.625, "completions/mean_terminated_length": 429.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.05743380855397149, "grad_norm": 0.1312714485926572, "kl": 0.04241943359375, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 3826541.0, "reward": 4.8984375, "reward_std": 1.2038742303848267, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6484375, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 365.15625, "completions/mean_terminated_length": 365.15625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.05784114052953157, "grad_norm": 0.1220036429767914, "kl": 0.042510986328125, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 3852890.0, "reward": 5.2265625, "reward_std": 1.383144736289978, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7890625, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 244.9375, "completions/mean_terminated_length": 244.9375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.05824847250509165, "grad_norm": 0.13772309161184693, "kl": 0.0450439453125, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 3875944.0, "reward": 5.8125, "reward_std": 1.4358084201812744, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.6875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.703125, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 348.71875, "completions/mean_terminated_length": 359.96774193548384, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.05865580448065173, "grad_norm": 0.13794321630421855, "kl": 0.03729248046875, "learning_rate": 1e-06, "loss": -0.0147, "num_tokens": 3906347.0, "reward": 4.65625, "reward_std": 1.218233346939087, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8125, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 292.625, "completions/mean_terminated_length": 292.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.059063136456211814, "grad_norm": 0.13730681937853773, "kl": 0.04180908203125, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 3931079.0, "reward": 4.921875, "reward_std": 1.142471432685852, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.671875, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 290.4375, "completions/mean_terminated_length": 299.80645161290323, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.059470468431771895, "grad_norm": 0.14981601476659048, "kl": 0.05426025390625, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 3959162.0, "reward": 4.734375, "reward_std": 1.2687596082687378, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.734375, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.059877800407331976, "grad_norm": 0.13171278590355248, "kl": 0.03729248046875, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 3982822.0, "reward": 4.734375, "reward_std": 1.3779085874557495, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.609375, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 370.4375, "completions/mean_terminated_length": 370.4375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.06028513238289206, "grad_norm": 0.06528599734141968, "kl": 0.09295654296875, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 4009284.0, "reward": 4.484375, "reward_std": 0.6813257336616516, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.546875, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 301.625, "completions/mean_terminated_length": 301.625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.06069246435845214, "grad_norm": 0.14054431347384852, "kl": 0.40924072265625, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 4034544.0, "reward": 4.7734375, "reward_std": 1.5535788536071777, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.7265625, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 289.3125, "completions/mean_terminated_length": 289.3125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.06109979633401222, "grad_norm": 0.10053438279511238, "kl": 0.16851806640625, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 4058338.0, "reward": 5.328125, "reward_std": 1.0581912994384766, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.953125, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 276.28125, "completions/mean_terminated_length": 276.28125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.0615071283095723, "grad_norm": 0.0999585842466013, "kl": 0.150634765625, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 4082147.0, "reward": 5.703125, "reward_std": 0.9247983694076538, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.953125, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 479.0, "completions/mean_terminated_length": 479.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.06191446028513238, "grad_norm": 0.13048559366632506, "kl": 0.03289794921875, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 4112499.0, "reward": 4.4453125, "reward_std": 1.2833878993988037, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 480.875, "completions/mean_terminated_length": 480.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.06232179226069246, "grad_norm": 0.06546655673170758, "kl": 0.120361328125, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 4142655.0, "reward": 4.1015625, "reward_std": 0.5515722632408142, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9453125, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 377.25, "completions/mean_terminated_length": 377.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.06272912423625254, "grad_norm": 0.14368937929694342, "kl": 0.06414794921875, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 4169063.0, "reward": 4.8359375, "reward_std": 1.4678257703781128, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 387.0, "completions/mean_terminated_length": 399.48387096774195, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.06313645621181263, "grad_norm": 0.14528266911903864, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": -0.0087, "num_tokens": 4200145.0, "reward": 4.3359375, "reward_std": 0.9601633548736572, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.6484375, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 332.25, "completions/mean_terminated_length": 342.96774193548384, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.0635437881873727, "grad_norm": 0.12773570890855301, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 4229192.0, "reward": 4.25, "reward_std": 1.2361493110656738, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.734375, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 387.4375, "completions/mean_terminated_length": 387.4375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.06395112016293279, "grad_norm": 0.09874887518593596, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 4257246.0, "reward": 4.3359375, "reward_std": 1.028090000152588, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7109375, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 542.46875, "completions/mean_terminated_length": 559.9677419354839, "completions/min_length": 0.0, "completions/min_terminated_length": 246.0, "epoch": 0.06435845213849287, "grad_norm": 0.127277549418077, "kl": 0.029144287109375, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 4293910.0, "reward": 4.296875, "reward_std": 0.9780712127685547, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.65625, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 374.5, "completions/mean_terminated_length": 374.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.06476578411405295, "grad_norm": 0.13527228501810654, "kl": 0.06500244140625, "learning_rate": 1e-06, "loss": -0.0046, "num_tokens": 4321214.0, "reward": 4.6953125, "reward_std": 1.41321861743927, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6328125, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 414.46875, "completions/mean_terminated_length": 414.46875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.06517311608961303, "grad_norm": 0.12072646535966278, "kl": 0.06292724609375, "learning_rate": 1e-06, "loss": -0.0072, "num_tokens": 4348813.0, "reward": 4.2890625, "reward_std": 1.082193374633789, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8515625, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 469.96875, "completions/mean_terminated_length": 469.96875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.06558044806517312, "grad_norm": 0.10543401512652517, "kl": 0.0916748046875, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 4379212.0, "reward": 3.265625, "reward_std": 0.684702455997467, "rewards/cargo_build_reward": 0.15625, "rewards/cargo_clippy_reward": 0.15625, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.953125, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 277.59375, "completions/mean_terminated_length": 277.59375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.0659877800407332, "grad_norm": 0.11898224685444178, "kl": 0.0526123046875, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 4403623.0, "reward": 5.1171875, "reward_std": 1.2387746572494507, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.4921875, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 333.78125, "completions/mean_terminated_length": 344.5483870967742, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.06639511201629328, "grad_norm": 0.13795453576396063, "kl": 0.272216796875, "learning_rate": 1e-06, "loss": -0.0219, "num_tokens": 4432845.0, "reward": 5.109375, "reward_std": 1.3525934219360352, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.796875, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 398.25, "completions/mean_terminated_length": 398.25, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.06680244399185337, "grad_norm": 0.10619325377296591, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 4461021.0, "reward": 4.4609375, "reward_std": 0.8485924005508423, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 336.375, "completions/mean_terminated_length": 336.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.06720977596741344, "grad_norm": 0.12246407602729473, "kl": 0.34552001953125, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 4486409.0, "reward": 5.3828125, "reward_std": 1.0585535764694214, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8828125, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 220.0625, "completions/mean_terminated_length": 220.0625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.06761710794297353, "grad_norm": 0.12973837432013488, "kl": 0.320556640625, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 4507763.0, "reward": 5.7578125, "reward_std": 1.0621415376663208, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.65625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7578125, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 356.28125, "completions/mean_terminated_length": 356.28125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.0680244399185336, "grad_norm": 0.13690153487600087, "kl": 0.09442138671875, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 4534548.0, "reward": 4.3671875, "reward_std": 1.2160022258758545, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8046875, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 515.25, "completions/mean_terminated_length": 531.8709677419355, "completions/min_length": 0.0, "completions/min_terminated_length": 203.0, "epoch": 0.06843177189409369, "grad_norm": 0.14113580127363495, "kl": 0.053314208984375, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 4569828.0, "reward": 3.75, "reward_std": 1.1177539825439453, "rewards/cargo_build_reward": 0.4375, "rewards/cargo_clippy_reward": 0.4375, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.875, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 466.65625, "completions/mean_terminated_length": 466.65625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.06883910386965376, "grad_norm": 0.14096201917283618, "kl": 0.081207275390625, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 4599761.0, "reward": 4.1484375, "reward_std": 1.1017944812774658, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.7265625, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 326.875, "completions/mean_terminated_length": 326.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.06924643584521385, "grad_norm": 0.12424183311593773, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 4624853.0, "reward": 4.9296875, "reward_std": 1.4418165683746338, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9296875, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 405.59375, "completions/mean_terminated_length": 405.59375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.06965376782077393, "grad_norm": 0.15246901793772144, "kl": 0.03509521484375, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 4652536.0, "reward": 4.6953125, "reward_std": 1.1468729972839355, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9453125, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 393.15625, "completions/mean_terminated_length": 393.15625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.07006109979633401, "grad_norm": 0.10859157260107942, "kl": 0.07415771484375, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 4680557.0, "reward": 4.890625, "reward_std": 1.0702264308929443, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.71875, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 468.5625, "completions/mean_terminated_length": 468.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.07046843177189409, "grad_norm": 0.08929775595064933, "kl": 0.03656005859375, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 4710519.0, "reward": 4.1640625, "reward_std": 0.8054457902908325, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6640625, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 294.8125, "completions/mean_terminated_length": 294.8125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.07087576374745418, "grad_norm": 0.10009815562182411, "kl": 0.04638671875, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 4734401.0, "reward": 5.078125, "reward_std": 1.4493857622146606, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.59375, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 511.3125, "completions/mean_terminated_length": 511.3125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.07128309572301425, "grad_norm": 0.15151117326113023, "kl": 0.60736083984375, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 4765819.0, "reward": 4.46875, "reward_std": 1.2704614400863647, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.59375, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 357.28125, "completions/mean_terminated_length": 357.28125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.07169042769857434, "grad_norm": 0.07947011852154963, "kl": 0.04974365234375, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 4791852.0, "reward": 4.640625, "reward_std": 0.7895081043243408, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.765625, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 512.96875, "completions/mean_terminated_length": 512.96875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.07209775967413443, "grad_norm": 0.10662422981590515, "kl": 0.0921630859375, "learning_rate": 1e-06, "loss": -0.0095, "num_tokens": 4823419.0, "reward": 3.4375, "reward_std": 0.9471868276596069, "rewards/cargo_build_reward": 0.3125, "rewards/cargo_clippy_reward": 0.3125, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8125, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 299.03125, "completions/mean_terminated_length": 299.03125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.0725050916496945, "grad_norm": 0.08467705969064211, "kl": 0.04705810546875, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 4847428.0, "reward": 5.0625, "reward_std": 0.700248122215271, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9375, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.07291242362525459, "grad_norm": 0.1323091423700111, "kl": 0.1751708984375, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 4872932.0, "reward": 4.7890625, "reward_std": 1.2005279064178467, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8515625, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 381.96875, "completions/mean_terminated_length": 381.96875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.07331975560081466, "grad_norm": 0.12808650426853077, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 4900195.0, "reward": 4.8046875, "reward_std": 0.9538981914520264, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8046875, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 304.84375, "completions/mean_terminated_length": 304.84375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.07372708757637475, "grad_norm": 0.10331216337152113, "kl": 0.07073974609375, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 4925254.0, "reward": 4.8984375, "reward_std": 0.8840129375457764, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.6640625, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.07413441955193482, "grad_norm": 0.12623768958158862, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 4948985.0, "reward": 4.84375, "reward_std": 1.2009623050689697, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.90625, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 422.0, "completions/mean_terminated_length": 422.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.07454175152749491, "grad_norm": 0.12579157587186837, "kl": 0.046142578125, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 4977897.0, "reward": 4.859375, "reward_std": 1.0070652961730957, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.71875, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 426.53125, "completions/mean_terminated_length": 426.53125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.07494908350305499, "grad_norm": 0.13752570104041778, "kl": 0.041015625, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 5007234.0, "reward": 4.1328125, "reward_std": 1.2121552228927612, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6953125, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 434.125, "completions/mean_terminated_length": 434.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.07535641547861507, "grad_norm": 0.11594259359655909, "kl": 0.06817626953125, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 5036142.0, "reward": 4.8203125, "reward_std": 0.9454158544540405, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.6796875, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 298.6875, "completions/mean_terminated_length": 298.6875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.07576374745417515, "grad_norm": 0.1308820961805529, "kl": 0.05621337890625, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 5060300.0, "reward": 4.8671875, "reward_std": 1.0690581798553467, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6796875, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 379.40625, "completions/mean_terminated_length": 379.40625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.07617107942973524, "grad_norm": 0.10645042668270711, "kl": 0.10748291015625, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 5087345.0, "reward": 5.0078125, "reward_std": 1.0140846967697144, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7265625, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 374.34375, "completions/mean_terminated_length": 374.34375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.07657841140529531, "grad_norm": 0.15089256688296765, "kl": 0.9410400390625, "learning_rate": 1e-06, "loss": -0.0055, "num_tokens": 5115100.0, "reward": 4.4921875, "reward_std": 1.379470705986023, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6796875, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 365.0, "completions/mean_terminated_length": 365.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.0769857433808554, "grad_norm": 0.1569538088020158, "kl": 0.1473388671875, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 5142796.0, "reward": 4.1015625, "reward_std": 1.5070769786834717, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.4921875, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 363.0, "completions/mean_terminated_length": 363.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.07739307535641547, "grad_norm": 0.05935078934065327, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 5169356.0, "reward": 4.4375, "reward_std": 0.42242497205734253, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.5, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9375, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 298.71875, "completions/mean_terminated_length": 298.71875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.07780040733197556, "grad_norm": 0.12545164979791618, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 5192971.0, "reward": 4.484375, "reward_std": 1.0269687175750732, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 457.71875, "completions/mean_terminated_length": 457.71875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.07820773930753565, "grad_norm": 0.14005548709107163, "kl": 0.04852294921875, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 5222666.0, "reward": 4.3046875, "reward_std": 1.2621102333068848, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 286.3125, "completions/mean_terminated_length": 286.3125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.07861507128309572, "grad_norm": 0.1185698427295713, "kl": 0.12652587890625, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 5246404.0, "reward": 5.3828125, "reward_std": 1.0377273559570312, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.5625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.5078125, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3218.0, "completions/max_terminated_length": 3218.0, "completions/mean_length": 491.1875, "completions/mean_terminated_length": 491.1875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.07902240325865581, "grad_norm": 0.13419747893981798, "kl": 0.167236328125, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 5277970.0, "reward": 4.6171875, "reward_std": 0.9943197965621948, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 392.84375, "completions/mean_terminated_length": 392.84375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.07942973523421588, "grad_norm": 0.1553068882473102, "kl": 1.0399169921875, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 5306125.0, "reward": 4.6171875, "reward_std": 1.373223066329956, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6171875, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 328.75, "completions/mean_terminated_length": 328.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.07983706720977597, "grad_norm": 0.09792381388479776, "kl": 0.07232666015625, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 5331549.0, "reward": 5.34375, "reward_std": 0.6353054046630859, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8125, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 387.15625, "completions/mean_terminated_length": 387.15625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.08024439918533605, "grad_norm": 0.1475298153624464, "kl": 0.15081787109375, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 5358842.0, "reward": 4.359375, "reward_std": 1.2290772199630737, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.796875, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 314.46875, "completions/mean_terminated_length": 314.46875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.08065173116089613, "grad_norm": 0.10582358722949149, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 5384809.0, "reward": 5.765625, "reward_std": 1.0285557508468628, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.59375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.703125, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 328.90625, "completions/mean_terminated_length": 328.90625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.08105906313645621, "grad_norm": 0.15206191338594935, "kl": 0.2681884765625, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 5411046.0, "reward": 5.0625, "reward_std": 0.9651427268981934, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 358.4375, "completions/mean_terminated_length": 358.4375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.0814663951120163, "grad_norm": 0.1075376922189615, "kl": 0.386962890625, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 5437500.0, "reward": 6.1171875, "reward_std": 1.1414694786071777, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9296875, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 349.46875, "completions/mean_terminated_length": 349.46875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.08187372708757637, "grad_norm": 0.07006979994831898, "kl": 0.0355072021484375, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 5463139.0, "reward": 4.484375, "reward_std": 0.5211516618728638, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.796875, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 343.75, "completions/mean_terminated_length": 343.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.08228105906313646, "grad_norm": 0.13950721322854165, "kl": 0.053985595703125, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 5489619.0, "reward": 4.4765625, "reward_std": 1.311608910560608, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.5390625, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1506.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 564.96875, "completions/mean_terminated_length": 564.96875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.08268839103869653, "grad_norm": 0.13863625665109122, "kl": 0.014312744140625, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 5523226.0, "reward": 3.9453125, "reward_std": 1.123884916305542, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.3828125, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 361.25, "completions/mean_terminated_length": 361.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.08309572301425662, "grad_norm": 0.0941997688772263, "kl": 0.0204620361328125, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 5549954.0, "reward": 5.8125, "reward_std": 0.9149646759033203, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.59375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.875, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 381.34375, "completions/mean_terminated_length": 381.34375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.0835030549898167, "grad_norm": 0.12969089104595022, "kl": 0.10968017578125, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 5577925.0, "reward": 4.6875, "reward_std": 1.2291152477264404, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.75, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 369.09375, "completions/mean_terminated_length": 369.09375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.08391038696537678, "grad_norm": 0.12012906410037177, "kl": 0.021697998046875, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 5604616.0, "reward": 4.9765625, "reward_std": 1.0764740705490112, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7265625, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 411.78125, "completions/mean_terminated_length": 411.78125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.08431771894093687, "grad_norm": 0.12298712714257735, "kl": 0.051971435546875, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 5632393.0, "reward": 4.21875, "reward_std": 1.062387228012085, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.78125, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 280.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.08472505091649694, "grad_norm": 0.09282797754422388, "kl": 0.34716796875, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 5656425.0, "reward": 5.234375, "reward_std": 1.1892833709716797, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.734375, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 313.3125, "completions/mean_terminated_length": 323.4193548387097, "completions/min_length": 0.0, "completions/min_terminated_length": 195.0, "epoch": 0.08513238289205703, "grad_norm": 0.13863374958378186, "kl": 0.033966064453125, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 5685387.0, "reward": 4.7734375, "reward_std": 1.572913408279419, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8046875, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.0855397148676171, "grad_norm": 0.16979858976128354, "kl": 0.07171630859375, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 5710951.0, "reward": 4.671875, "reward_std": 1.4163511991500854, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.796875, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 357.3125, "completions/mean_terminated_length": 357.3125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.0859470468431772, "grad_norm": 0.13105401371206019, "kl": 0.0201873779296875, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 5737321.0, "reward": 5.5234375, "reward_std": 1.11106276512146, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9609375, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 346.1875, "completions/mean_terminated_length": 346.1875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.08635437881873727, "grad_norm": 0.0925506339214318, "kl": 0.02349853515625, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 5763687.0, "reward": 5.15625, "reward_std": 0.7716832756996155, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.65625, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 413.4375, "completions/mean_terminated_length": 413.4375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.08676171079429736, "grad_norm": 0.10279410328832947, "kl": 0.021636962890625, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 5791749.0, "reward": 4.75, "reward_std": 0.8146636486053467, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.75, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 449.03125, "completions/mean_terminated_length": 449.03125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.08716904276985743, "grad_norm": 0.11318549135026058, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 5821478.0, "reward": 4.296875, "reward_std": 0.9958111643791199, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.796875, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 356.09375, "completions/mean_terminated_length": 356.09375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.08757637474541752, "grad_norm": 0.11381759941293987, "kl": 0.024627685546875, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 5847425.0, "reward": 4.921875, "reward_std": 1.115452527999878, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 356.8125, "completions/mean_terminated_length": 356.8125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.08798370672097759, "grad_norm": 0.11961343666388408, "kl": 0.028656005859375, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 5874387.0, "reward": 5.171875, "reward_std": 1.093719482421875, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.84375, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 389.5625, "completions/mean_terminated_length": 389.5625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.08839103869653768, "grad_norm": 0.18720520794622916, "kl": 0.021575927734375, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 5901989.0, "reward": 4.3125, "reward_std": 1.3922038078308105, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.5625, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 456.875, "completions/mean_terminated_length": 456.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.08879837067209775, "grad_norm": 0.09293499024827523, "kl": 0.038543701171875, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 5931257.0, "reward": 4.484375, "reward_std": 0.7560322880744934, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.671875, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 425.125, "completions/mean_terminated_length": 425.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.08920570264765784, "grad_norm": 0.16333880577412674, "kl": 0.03631591796875, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 5959469.0, "reward": 4.609375, "reward_std": 1.1915391683578491, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.890625, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 334.53125, "completions/mean_terminated_length": 334.53125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.08961303462321792, "grad_norm": 0.13580664034432854, "kl": 0.13531494140625, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 5985382.0, "reward": 4.8203125, "reward_std": 1.308077096939087, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.5703125, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 423.8125, "completions/mean_terminated_length": 423.8125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.090020366598778, "grad_norm": 0.18168422733726539, "kl": 0.136383056640625, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 6014240.0, "reward": 3.90625, "reward_std": 1.5534679889678955, "rewards/cargo_build_reward": 0.40625, "rewards/cargo_clippy_reward": 0.40625, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.59375, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 322.6875, "completions/mean_terminated_length": 322.6875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.09042769857433809, "grad_norm": 0.11897470122290314, "kl": 0.063751220703125, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 6039182.0, "reward": 5.609375, "reward_std": 1.1206417083740234, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 341.5625, "completions/mean_terminated_length": 352.5806451612903, "completions/min_length": 0.0, "completions/min_terminated_length": 220.0, "epoch": 0.09083503054989817, "grad_norm": 0.14033574758499026, "kl": 0.049041748046875, "learning_rate": 1e-06, "loss": -0.0134, "num_tokens": 6068859.0, "reward": 4.8515625, "reward_std": 1.2350282669067383, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8828125, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 455.90625, "completions/mean_terminated_length": 455.90625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.09124236252545825, "grad_norm": 0.14672812399725055, "kl": 0.032989501953125, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 6097864.0, "reward": 4.78125, "reward_std": 0.9682382941246033, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.71875, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 290.40625, "completions/mean_terminated_length": 290.40625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.09164969450101833, "grad_norm": 0.10355516835859621, "kl": 0.031402587890625, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 6122701.0, "reward": 5.3515625, "reward_std": 1.052793264389038, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7890625, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 345.96875, "completions/mean_terminated_length": 357.1290322580645, "completions/min_length": 0.0, "completions/min_terminated_length": 174.0, "epoch": 0.09205702647657842, "grad_norm": 0.18047877874778687, "kl": 0.073516845703125, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 6152920.0, "reward": 4.3203125, "reward_std": 1.6043800115585327, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.6328125, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 326.96875, "completions/mean_terminated_length": 326.96875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.09246435845213849, "grad_norm": 0.13959796301881863, "kl": 0.035247802734375, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 6178071.0, "reward": 5.046875, "reward_std": 1.2643322944641113, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.984375, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 391.0625, "completions/mean_terminated_length": 391.0625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.09287169042769858, "grad_norm": 0.15727370379536706, "kl": 0.157135009765625, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 6205689.0, "reward": 4.8515625, "reward_std": 1.229414939880371, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6015625, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 369.34375, "completions/mean_terminated_length": 369.34375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.09327902240325865, "grad_norm": 0.09634590946427983, "kl": 0.025482177734375, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 6232316.0, "reward": 5.6953125, "reward_std": 0.7354626059532166, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9453125, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 264.03125, "completions/mean_terminated_length": 264.03125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.09368635437881874, "grad_norm": 0.06070469187112743, "kl": 0.042938232421875, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 6255941.0, "reward": 5.4296875, "reward_std": 0.6073668003082275, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8671875, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.09409368635437881, "grad_norm": 0.11551822096300121, "kl": 0.043609619140625, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 6279656.0, "reward": 5.6015625, "reward_std": 1.2009453773498535, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.5625, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7109375, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 261.375, "completions/mean_terminated_length": 261.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.0945010183299389, "grad_norm": 0.09422765503292208, "kl": 0.05816650390625, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 6303220.0, "reward": 5.96875, "reward_std": 1.0341272354125977, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.65625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.71875, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 376.0625, "completions/mean_terminated_length": 376.0625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.09490835030549898, "grad_norm": 0.30940106585791494, "kl": 3.4390869140625, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 6330718.0, "reward": 4.4453125, "reward_std": 0.7659948468208313, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 322.09375, "completions/mean_terminated_length": 322.09375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.09531568228105906, "grad_norm": 0.11744059123433084, "kl": 0.3782958984375, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 6354857.0, "reward": 5.2734375, "reward_std": 0.7889077663421631, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8984375, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 376.5625, "completions/mean_terminated_length": 376.5625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.09572301425661914, "grad_norm": 0.13686359963866218, "kl": 0.032012939453125, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 6381963.0, "reward": 4.96875, "reward_std": 1.042311668395996, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.703125, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 323.46875, "completions/mean_terminated_length": 323.46875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.09613034623217923, "grad_norm": 0.12819766906736543, "kl": 0.61492919921875, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 6407570.0, "reward": 5.4765625, "reward_std": 1.0316227674484253, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8515625, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 464.125, "completions/mean_terminated_length": 464.125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.09653767820773931, "grad_norm": 0.1752475302003345, "kl": 0.0914306640625, "learning_rate": 1e-06, "loss": -0.008, "num_tokens": 6438094.0, "reward": 4.5234375, "reward_std": 1.512213945388794, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.5234375, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1079.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 331.5, "completions/mean_terminated_length": 331.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.09694501018329939, "grad_norm": 0.07139200316241337, "kl": 0.04351806640625, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 6463982.0, "reward": 5.2421875, "reward_std": 0.496543288230896, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6796875, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1147.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 422.0625, "completions/mean_terminated_length": 422.0625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.09735234215885948, "grad_norm": 0.1619291541344302, "kl": 0.6241455078125, "learning_rate": 1e-06, "loss": 0.0068, "num_tokens": 6492720.0, "reward": 4.640625, "reward_std": 1.1303948163986206, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.828125, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 439.625, "completions/mean_terminated_length": 453.80645161290323, "completions/min_length": 0.0, "completions/min_terminated_length": 284.0, "epoch": 0.09775967413441955, "grad_norm": 0.1570995884574694, "kl": 0.5472412109375, "learning_rate": 1e-06, "loss": -0.0084, "num_tokens": 6526062.0, "reward": 3.5078125, "reward_std": 1.067907691001892, "rewards/cargo_build_reward": 0.375, "rewards/cargo_clippy_reward": 0.375, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8046875, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 359.375, "completions/mean_terminated_length": 359.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.09816700610997964, "grad_norm": 0.12425362441168204, "kl": 0.06414794921875, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 6552658.0, "reward": 5.015625, "reward_std": 1.295576572418213, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.703125, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 328.0, "completions/mean_terminated_length": 328.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.09857433808553971, "grad_norm": 0.11487237634073325, "kl": 0.05828857421875, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 6578202.0, "reward": 5.3046875, "reward_std": 1.1267704963684082, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8046875, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 365.59375, "completions/mean_terminated_length": 365.59375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.0989816700610998, "grad_norm": 0.06695475981025639, "kl": 0.4957275390625, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 6605173.0, "reward": 4.8515625, "reward_std": 0.5640454888343811, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8515625, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 349.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.09938900203665987, "grad_norm": 0.14586003600460437, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 6631137.0, "reward": 5.2421875, "reward_std": 1.2937591075897217, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8359375, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 386.1875, "completions/mean_terminated_length": 398.64516129032256, "completions/min_length": 0.0, "completions/min_terminated_length": 178.0, "epoch": 0.09979633401221996, "grad_norm": 0.1485284452112313, "kl": 0.14459228515625, "learning_rate": 1e-06, "loss": -0.0252, "num_tokens": 6661599.0, "reward": 4.6875, "reward_std": 1.3923399448394775, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8125, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 398.8125, "completions/mean_terminated_length": 398.8125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.10020366598778004, "grad_norm": 0.20333341422047588, "kl": 0.19537353515625, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 6689665.0, "reward": 4.6796875, "reward_std": 1.5273429155349731, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8046875, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 276.40625, "completions/mean_terminated_length": 285.3225806451613, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.10061099796334012, "grad_norm": 0.1406114255986296, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": -0.0096, "num_tokens": 6717143.0, "reward": 4.6328125, "reward_std": 1.586996078491211, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8828125, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 380.375, "completions/mean_terminated_length": 380.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.1010183299389002, "grad_norm": 0.14199568353692169, "kl": 0.06744384765625, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 6743987.0, "reward": 5.2421875, "reward_std": 1.2977920770645142, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 357.0, "completions/mean_terminated_length": 357.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.10142566191446029, "grad_norm": 0.07421016421352952, "kl": 0.05303955078125, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 6770803.0, "reward": 4.34375, "reward_std": 0.5166193842887878, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.71875, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 452.90625, "completions/mean_terminated_length": 452.90625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.10183299389002037, "grad_norm": 0.0931240885796066, "kl": 0.086395263671875, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 6800216.0, "reward": 4.21875, "reward_std": 0.7194794416427612, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.65625, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 390.3125, "completions/mean_terminated_length": 390.3125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.10224032586558045, "grad_norm": 0.08245473465062367, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 6827746.0, "reward": 5.1015625, "reward_std": 0.829085111618042, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 485.78125, "completions/mean_terminated_length": 485.78125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.10264765784114054, "grad_norm": 0.114105467050597, "kl": 0.035552978515625, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 6858803.0, "reward": 5.09375, "reward_std": 0.9395735859870911, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.53125, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 375.03125, "completions/mean_terminated_length": 375.03125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.10305498981670061, "grad_norm": 0.08651622443758208, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 6885972.0, "reward": 4.21875, "reward_std": 0.8213375806808472, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.71875, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 324.8125, "completions/mean_terminated_length": 324.8125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.1034623217922607, "grad_norm": 0.13429322999276266, "kl": 0.105712890625, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 6911958.0, "reward": 4.6640625, "reward_std": 1.0780224800109863, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 359.5, "completions/mean_terminated_length": 359.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.10386965376782077, "grad_norm": 0.10690645919088833, "kl": 0.058624267578125, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 6938582.0, "reward": 5.46875, "reward_std": 0.9955792427062988, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.84375, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 432.65625, "completions/mean_terminated_length": 432.65625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.10427698574338086, "grad_norm": 0.13001312245704905, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 6966995.0, "reward": 3.6796875, "reward_std": 1.1213611364364624, "rewards/cargo_build_reward": 0.40625, "rewards/cargo_clippy_reward": 0.40625, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 372.71875, "completions/mean_terminated_length": 372.71875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.10468431771894093, "grad_norm": 0.11192869953967434, "kl": 0.06536865234375, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 6994042.0, "reward": 4.65625, "reward_std": 1.0012075901031494, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.78125, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3202.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 403.0, "completions/mean_terminated_length": 323.1333333333333, "completions/min_length": 0.0, "completions/min_terminated_length": 173.0, "epoch": 0.10509164969450102, "grad_norm": 0.1265180220715068, "kl": 0.129150390625, "learning_rate": 1e-06, "loss": 0.064, "num_tokens": 7025338.0, "reward": 4.46875, "reward_std": 1.0521612167358398, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 0.9375, "rewards/non_empty_reward": 0.9375, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.875, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 342.375, "completions/mean_terminated_length": 342.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.1054989816700611, "grad_norm": 0.1285258846222483, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 7050894.0, "reward": 5.03125, "reward_std": 1.1087009906768799, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.84375, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 336.53125, "completions/mean_terminated_length": 336.53125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.10590631364562118, "grad_norm": 0.15302848713988015, "kl": 0.21875, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 7076143.0, "reward": 5.1171875, "reward_std": 1.249140977859497, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8984375, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.10631364562118126, "grad_norm": 0.1659479395384913, "kl": 0.153564453125, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 7100943.0, "reward": 5.0625, "reward_std": 1.3257355690002441, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.796875, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 302.9375, "completions/mean_terminated_length": 302.9375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.10672097759674135, "grad_norm": 0.1403295319168937, "kl": 0.0731201171875, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 7124325.0, "reward": 6.0078125, "reward_std": 1.2643563747406006, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.75, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 404.65625, "completions/mean_terminated_length": 404.65625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.10712830957230142, "grad_norm": 0.13674927081734936, "kl": 0.04754638671875, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 7152450.0, "reward": 5.1484375, "reward_std": 1.1286545991897583, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8359375, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 398.6875, "completions/mean_terminated_length": 398.6875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.10753564154786151, "grad_norm": 0.18572905209539947, "kl": 0.1798095703125, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 7180888.0, "reward": 4.5078125, "reward_std": 1.3470971584320068, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.7421875, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 368.78125, "completions/mean_terminated_length": 368.78125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.1079429735234216, "grad_norm": 0.15649574599231655, "kl": 0.49713134765625, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 7207641.0, "reward": 5.4609375, "reward_std": 1.333330750465393, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8359375, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 295.0625, "completions/mean_terminated_length": 295.0625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.10835030549898167, "grad_norm": 0.08622051055470983, "kl": 0.113525390625, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 7232843.0, "reward": 6.0859375, "reward_std": 0.3644236922264099, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.59375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9609375, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 248.40625, "completions/mean_terminated_length": 248.40625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.10875763747454176, "grad_norm": 0.0884616838385181, "kl": 0.23699951171875, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 7255488.0, "reward": 6.1171875, "reward_std": 0.9955595135688782, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.6875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8671875, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 401.09375, "completions/mean_terminated_length": 401.09375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.10916496945010183, "grad_norm": 0.15097697805078084, "kl": 0.169158935546875, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 7282555.0, "reward": 4.890625, "reward_std": 1.341402292251587, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.953125, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 352.65625, "completions/mean_terminated_length": 352.65625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.10957230142566192, "grad_norm": 0.1561985323559366, "kl": 0.1107177734375, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 7308656.0, "reward": 4.765625, "reward_std": 1.2907824516296387, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.765625, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 326.4375, "completions/mean_terminated_length": 326.4375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.109979633401222, "grad_norm": 0.17435205466011472, "kl": 0.2305908203125, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 7334062.0, "reward": 4.9609375, "reward_std": 1.5009129047393799, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7109375, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 441.21875, "completions/mean_terminated_length": 441.21875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.11038696537678208, "grad_norm": 0.16387973109810397, "kl": 0.10418701171875, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 7363685.0, "reward": 4.5, "reward_std": 1.1646424531936646, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9375, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 268.625, "completions/mean_terminated_length": 268.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.11079429735234216, "grad_norm": 0.1119476393435259, "kl": 0.447021484375, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 7387521.0, "reward": 4.546875, "reward_std": 1.0633394718170166, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.796875, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 382.90625, "completions/mean_terminated_length": 382.90625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.11120162932790224, "grad_norm": 0.1262634028903118, "kl": 0.6927490234375, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 7414870.0, "reward": 4.5625, "reward_std": 0.5277684926986694, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8125, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 357.34375, "completions/mean_terminated_length": 357.34375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.11160896130346232, "grad_norm": 4.553639124987937, "kl": 54.0347900390625, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 7441273.0, "reward": 5.125, "reward_std": 1.1450188159942627, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8125, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 334.3125, "completions/mean_terminated_length": 345.0967741935484, "completions/min_length": 0.0, "completions/min_terminated_length": 224.0, "epoch": 0.1120162932790224, "grad_norm": 1.8865599360062382, "kl": 26.4600830078125, "learning_rate": 1e-06, "loss": -0.0058, "num_tokens": 7471175.0, "reward": 4.328125, "reward_std": 1.3908053636550903, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.703125, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 467.71875, "completions/mean_terminated_length": 467.71875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.11242362525458248, "grad_norm": 0.14878318328382348, "kl": 0.144287109375, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 7501630.0, "reward": 4.5, "reward_std": 1.01895272731781, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.90625, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 625.5, "completions/mean_terminated_length": 625.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.11283095723014257, "grad_norm": 0.16104492485108915, "kl": 0.12481689453125, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 7536238.0, "reward": 3.953125, "reward_std": 1.168134093284607, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.90625, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 455.0625, "completions/mean_terminated_length": 455.0625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.11323828920570264, "grad_norm": 0.19205069080876094, "kl": 0.22412109375, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 7566120.0, "reward": 4.09375, "reward_std": 1.2660152912139893, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.84375, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 282.9375, "completions/mean_terminated_length": 282.9375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.11364562118126273, "grad_norm": 0.1483138613685051, "kl": 0.246337890625, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 7589582.0, "reward": 5.0234375, "reward_std": 1.3213204145431519, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.7265625, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 321.71875, "completions/mean_terminated_length": 321.71875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.11405295315682282, "grad_norm": 0.11701109651705693, "kl": 0.1343994140625, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 7615005.0, "reward": 5.6953125, "reward_std": 0.9445090293884277, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8828125, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 323.8125, "completions/mean_terminated_length": 323.8125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.11446028513238289, "grad_norm": 0.08937033137104666, "kl": 0.0780029296875, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 7640079.0, "reward": 5.6484375, "reward_std": 0.7428663372993469, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8984375, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.11486761710794298, "grad_norm": 0.106298534495795, "kl": 0.39532470703125, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 7661979.0, "reward": 5.75, "reward_std": 1.0583428144454956, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.59375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.75, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 403.3125, "completions/mean_terminated_length": 403.3125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.11527494908350305, "grad_norm": 0.180058683124987, "kl": 0.06756591796875, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 7689765.0, "reward": 5.15625, "reward_std": 1.1191527843475342, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.890625, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 273.2903225806452, "completions/min_length": 0.0, "completions/min_terminated_length": 126.0, "epoch": 0.11568228105906314, "grad_norm": 0.17170194624800517, "kl": 2.3291015625, "learning_rate": 1e-06, "loss": -0.0113, "num_tokens": 7716337.0, "reward": 5.2109375, "reward_std": 1.13656485080719, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8359375, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 379.5625, "completions/mean_terminated_length": 379.5625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.11608961303462322, "grad_norm": 0.13016667132985651, "kl": 0.38629150390625, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 7744059.0, "reward": 4.5078125, "reward_std": 1.093302607536316, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 324.125, "completions/mean_terminated_length": 324.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.1164969450101833, "grad_norm": 0.13264994720706488, "kl": 0.1514892578125, "learning_rate": 1e-06, "loss": 0.018, "num_tokens": 7770631.0, "reward": 5.15625, "reward_std": 1.1100351810455322, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.71875, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 350.28125, "completions/mean_terminated_length": 350.28125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.11690427698574338, "grad_norm": 0.14118972283961445, "kl": 0.1871337890625, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 7796424.0, "reward": 4.578125, "reward_std": 1.092275619506836, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6875, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 302.90625, "completions/mean_terminated_length": 302.90625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.11731160896130347, "grad_norm": 0.1515250622599137, "kl": 0.2572021484375, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 7820885.0, "reward": 5.53125, "reward_std": 1.1256417036056519, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6875, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 335.5, "completions/mean_terminated_length": 335.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.11771894093686354, "grad_norm": 0.1885785813799223, "kl": 0.2010498046875, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 7845949.0, "reward": 5.046875, "reward_std": 1.400909185409546, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.796875, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 349.03125, "completions/mean_terminated_length": 349.03125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.11812627291242363, "grad_norm": 0.15741667715465132, "kl": 0.1563720703125, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 7871958.0, "reward": 4.7578125, "reward_std": 1.3605859279632568, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8828125, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 374.65625, "completions/mean_terminated_length": 374.65625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1185336048879837, "grad_norm": 0.16564801145243255, "kl": 0.73828125, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 7899091.0, "reward": 4.3203125, "reward_std": 1.1249115467071533, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 460.03125, "completions/mean_terminated_length": 460.03125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.11894093686354379, "grad_norm": 0.9810150213275214, "kl": 8.68408203125, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 7929436.0, "reward": 4.6640625, "reward_std": 1.204304814338684, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7890625, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.11934826883910386, "grad_norm": 0.13758647833349244, "kl": 0.2254638671875, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 7955676.0, "reward": 5.0234375, "reward_std": 1.1061217784881592, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 334.875, "completions/mean_terminated_length": 334.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.11975560081466395, "grad_norm": 0.32509081149452707, "kl": 4.1324462890625, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 7981536.0, "reward": 5.1484375, "reward_std": 1.2189942598342896, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8515625, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 514.75, "completions/mean_terminated_length": 514.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.12016293279022404, "grad_norm": 0.16648108780140852, "kl": 0.33056640625, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 8012952.0, "reward": 4.765625, "reward_std": 1.218648910522461, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.890625, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 295.84375, "completions/mean_terminated_length": 305.38709677419354, "completions/min_length": 0.0, "completions/min_terminated_length": 209.0, "epoch": 0.12057026476578411, "grad_norm": 0.11883015365571581, "kl": 0.2327880859375, "learning_rate": 1e-06, "loss": -0.0151, "num_tokens": 8041419.0, "reward": 5.375, "reward_std": 0.748497724533081, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8125, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 289.34375, "completions/mean_terminated_length": 289.34375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1209775967413442, "grad_norm": 0.10458427429280603, "kl": 0.536865234375, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 8065958.0, "reward": 4.609375, "reward_std": 0.9699586033821106, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.578125, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 492.1875, "completions/mean_terminated_length": 492.1875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.12138492871690428, "grad_norm": 0.14186188295209, "kl": 0.09375, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 8097228.0, "reward": 4.8671875, "reward_std": 1.137942910194397, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8046875, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 321.15625, "completions/mean_terminated_length": 321.15625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.12179226069246436, "grad_norm": 0.09821785497127014, "kl": 0.302734375, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 8121425.0, "reward": 5.4453125, "reward_std": 0.7561496496200562, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 346.0625, "completions/mean_terminated_length": 346.0625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.12219959266802444, "grad_norm": 0.17510460830680546, "kl": 0.271728515625, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 8147603.0, "reward": 4.9453125, "reward_std": 1.1475259065628052, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8828125, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 305.59375, "completions/mean_terminated_length": 305.59375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.12260692464358453, "grad_norm": 0.12321578572372362, "kl": 0.157958984375, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 8172142.0, "reward": 4.53125, "reward_std": 0.9470474123954773, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.71875, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 368.46875, "completions/mean_terminated_length": 368.46875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.1230142566191446, "grad_norm": 0.14119045615995254, "kl": 0.05419921875, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 8199053.0, "reward": 5.25, "reward_std": 0.6313312649726868, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.875, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 291.25, "completions/mean_terminated_length": 291.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.12342158859470469, "grad_norm": 0.1190369601319705, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 8223021.0, "reward": 5.34375, "reward_std": 1.0053589344024658, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.84375, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 278.59375, "completions/mean_terminated_length": 278.59375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.12382892057026476, "grad_norm": 0.152233616540164, "kl": 1.1922607421875, "learning_rate": 1e-06, "loss": -0.0056, "num_tokens": 8246048.0, "reward": 5.7890625, "reward_std": 0.576694667339325, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.59375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 233.9375, "completions/mean_terminated_length": 233.9375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.12423625254582485, "grad_norm": 0.08187401157892721, "kl": 0.1328125, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 8268582.0, "reward": 6.125, "reward_std": 0.8295804858207703, "rewards/cargo_build_reward": 1.0, "rewards/cargo_clippy_reward": 1.0, "rewards/cargo_test_reward": 0.6875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.75, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 295.46875, "completions/mean_terminated_length": 295.46875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.12464358452138492, "grad_norm": 0.11004194557650704, "kl": 0.040069580078125, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 8293229.0, "reward": 6.21875, "reward_std": 0.8219360113143921, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.71875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.84375, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 361.375, "completions/mean_terminated_length": 361.375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.125050916496945, "grad_norm": 0.10735721194718409, "kl": 0.5159912109375, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 8319873.0, "reward": 5.25, "reward_std": 0.7978966236114502, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6875, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 422.0, "completions/mean_terminated_length": 422.0, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.12545824847250509, "grad_norm": 0.5269721152618081, "kl": 4.53875732421875, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 8348729.0, "reward": 4.984375, "reward_std": 1.289331316947937, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 323.15625, "completions/mean_terminated_length": 323.15625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.12586558044806517, "grad_norm": 0.14390950392071175, "kl": 0.1331787109375, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 8373958.0, "reward": 5.1796875, "reward_std": 0.7235574722290039, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8203125, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 356.75, "completions/mean_terminated_length": 356.75, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.12627291242362526, "grad_norm": 0.13907683225817433, "kl": 0.04400634765625, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 8400790.0, "reward": 5.390625, "reward_std": 1.155860185623169, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.984375, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 297.96875, "completions/mean_terminated_length": 297.96875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.12668024439918535, "grad_norm": 0.08076983390044086, "kl": 0.21533203125, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 8425317.0, "reward": 5.8359375, "reward_std": 0.5163742303848267, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.71875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 312.65625, "completions/mean_terminated_length": 312.65625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.1270875763747454, "grad_norm": 0.14598538570078823, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 8450338.0, "reward": 4.7265625, "reward_std": 0.9958631992340088, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 269.09375, "completions/mean_terminated_length": 269.09375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1274949083503055, "grad_norm": 0.148913448446011, "kl": 0.2449951171875, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 8473789.0, "reward": 5.484375, "reward_std": 1.3874802589416504, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.734375, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 344.84375, "completions/mean_terminated_length": 344.84375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.12790224032586558, "grad_norm": 0.1460142084296301, "kl": 0.1802978515625, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 8499832.0, "reward": 5.7421875, "reward_std": 0.72386634349823, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.6875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6796875, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 406.53125, "completions/mean_terminated_length": 406.53125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.12830957230142567, "grad_norm": 0.15246647434109337, "kl": 0.2283935546875, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 8527793.0, "reward": 4.6640625, "reward_std": 1.1216604709625244, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7578125, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 342.34375, "completions/mean_terminated_length": 342.34375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.12871690427698573, "grad_norm": 0.1582283452943812, "kl": 0.1551513671875, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 8554076.0, "reward": 5.3359375, "reward_std": 1.1304055452346802, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 500.71875, "completions/mean_terminated_length": 500.71875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.12912423625254582, "grad_norm": 0.1841685716627463, "kl": 0.128814697265625, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 8585875.0, "reward": 4.828125, "reward_std": 1.1464003324508667, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.78125, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 403.90625, "completions/mean_terminated_length": 403.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.1295315682281059, "grad_norm": 0.17590877975865102, "kl": 0.82269287109375, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 8614400.0, "reward": 5.015625, "reward_std": 1.1951438188552856, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.890625, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 428.0, "completions/mean_terminated_length": 428.0, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.129938900203666, "grad_norm": 0.15800969865178677, "kl": 0.3536376953125, "learning_rate": 1e-06, "loss": 0.0045, "num_tokens": 8642992.0, "reward": 4.6796875, "reward_std": 1.075789213180542, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9296875, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 574.84375, "completions/mean_terminated_length": 574.84375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.13034623217922606, "grad_norm": 0.16975018817715407, "kl": 0.6793212890625, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 8676963.0, "reward": 3.9140625, "reward_std": 1.0297878980636597, "rewards/cargo_build_reward": 0.5, "rewards/cargo_clippy_reward": 0.5, "rewards/cargo_test_reward": 0.0, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 295.84375, "completions/mean_terminated_length": 295.84375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.13075356415478614, "grad_norm": 0.12735954359973728, "kl": 0.1572265625, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 8700990.0, "reward": 5.609375, "reward_std": 1.1104497909545898, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 365.78125, "completions/mean_terminated_length": 365.78125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.13116089613034623, "grad_norm": 0.1910300889247685, "kl": 1.3275146484375, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 8727079.0, "reward": 5.046875, "reward_std": 1.2593364715576172, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.984375, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 478.21875, "completions/mean_terminated_length": 478.21875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.13156822810590632, "grad_norm": 0.09355194134232056, "kl": 0.1529541015625, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 8757606.0, "reward": 4.1484375, "reward_std": 0.6420546174049377, "rewards/cargo_build_reward": 0.53125, "rewards/cargo_clippy_reward": 0.53125, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 289.4375, "completions/mean_terminated_length": 289.4375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.1319755600814664, "grad_norm": 0.11677201059195703, "kl": 0.129638671875, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 8782420.0, "reward": 4.953125, "reward_std": 0.6945017576217651, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.765625, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 491.75, "completions/mean_terminated_length": 491.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.13238289205702647, "grad_norm": 0.15681318631924113, "kl": 0.09246826171875, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 8813252.0, "reward": 4.15625, "reward_std": 0.9840061664581299, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.71875, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 395.15625, "completions/mean_terminated_length": 395.15625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.13279022403258656, "grad_norm": 0.0818899407346736, "kl": 0.06695556640625, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 8841049.0, "reward": 4.5078125, "reward_std": 0.7758102416992188, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 294.65625, "completions/mean_terminated_length": 294.65625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.13319755600814664, "grad_norm": 0.11106359578436034, "kl": 0.0850830078125, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 8865686.0, "reward": 5.0078125, "reward_std": 0.8287466764450073, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6953125, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 382.1875, "completions/mean_terminated_length": 382.1875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.13360488798370673, "grad_norm": 0.2628942425661649, "kl": 0.08209228515625, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 8893004.0, "reward": 5.0546875, "reward_std": 1.4586012363433838, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8671875, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 447.625, "completions/mean_terminated_length": 462.06451612903226, "completions/min_length": 0.0, "completions/min_terminated_length": 131.0, "epoch": 0.1340122199592668, "grad_norm": 0.14008020197789925, "kl": 0.04327392578125, "learning_rate": 1e-06, "loss": -0.0257, "num_tokens": 8926071.0, "reward": 5.3671875, "reward_std": 0.9594376087188721, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8671875, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 286.6875, "completions/mean_terminated_length": 286.6875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.13441955193482688, "grad_norm": 0.08983797578932191, "kl": 0.75439453125, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 8950333.0, "reward": 5.859375, "reward_std": 0.7503848075866699, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.984375, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 433.5625, "completions/mean_terminated_length": 433.5625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.13482688391038697, "grad_norm": 0.18433011032100935, "kl": 0.2506103515625, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 8979783.0, "reward": 4.296875, "reward_std": 1.180513620376587, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.734375, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 249.09375, "completions/mean_terminated_length": 249.74193548387098, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.13523421588594706, "grad_norm": 0.11900292517574411, "kl": 0.2587890625, "learning_rate": 1e-06, "loss": -0.0067, "num_tokens": 9003378.0, "reward": 5.8984375, "reward_std": 0.7815060615539551, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.6875, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.9375, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.6953125, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 280.5, "completions/mean_terminated_length": 280.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.13564154786150712, "grad_norm": 0.15159153025833763, "kl": 0.3828125, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 9028162.0, "reward": 5.1484375, "reward_std": 1.1171514987945557, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.5859375, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 367.28125, "completions/mean_terminated_length": 367.28125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.1360488798370672, "grad_norm": 0.10697897189105052, "kl": 0.24005126953125, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 9055331.0, "reward": 4.46875, "reward_std": 1.0055537223815918, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.78125, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 391.46875, "completions/mean_terminated_length": 391.46875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.1364562118126273, "grad_norm": 0.1396713879603593, "kl": 0.1015625, "learning_rate": 1e-06, "loss": -0.0046, "num_tokens": 9083442.0, "reward": 4.421875, "reward_std": 1.1342871189117432, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 435.15625, "completions/mean_terminated_length": 435.15625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.13686354378818738, "grad_norm": 0.17972755364640292, "kl": 0.1123046875, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 9112679.0, "reward": 5.0390625, "reward_std": 1.356706142425537, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8515625, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 366.65625, "completions/mean_terminated_length": 366.65625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.13727087576374744, "grad_norm": 0.11405749313429404, "kl": 0.12042236328125, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 9139172.0, "reward": 5.796875, "reward_std": 1.0548584461212158, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.984375, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 461.875, "completions/mean_terminated_length": 461.875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.13767820773930753, "grad_norm": 0.13903505250638792, "kl": 0.2977294921875, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 9169560.0, "reward": 3.6171875, "reward_std": 0.8829737305641174, "rewards/cargo_build_reward": 0.34375, "rewards/cargo_clippy_reward": 0.28125, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9296875, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 355.875, "completions/mean_terminated_length": 355.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.13808553971486762, "grad_norm": 0.16386479710192556, "kl": 0.55517578125, "learning_rate": 1e-06, "loss": -0.008, "num_tokens": 9195884.0, "reward": 5.4453125, "reward_std": 1.189713954925537, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.90625, "rewards/tests_have_asserts_reward": 0.7421875, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 429.625, "completions/mean_terminated_length": 429.625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.1384928716904277, "grad_norm": 0.16134912197389864, "kl": 0.1817626953125, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 9223960.0, "reward": 5.1484375, "reward_std": 1.121705412864685, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9609375, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 369.125, "completions/mean_terminated_length": 369.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.1389002036659878, "grad_norm": 0.14456161106416965, "kl": 0.25439453125, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 9251340.0, "reward": 5.09375, "reward_std": 1.045795202255249, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.71875, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 282.96875, "completions/mean_terminated_length": 282.96875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.13930753564154785, "grad_norm": 0.07830169143624639, "kl": 0.06414794921875, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 9275587.0, "reward": 5.7421875, "reward_std": 0.7753755450248718, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9296875, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 324.46875, "completions/mean_terminated_length": 324.46875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.13971486761710794, "grad_norm": 0.18460081439328185, "kl": 2.6995849609375, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 9300674.0, "reward": 4.9140625, "reward_std": 1.3185405731201172, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7265625, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1234.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 355.90625, "completions/mean_terminated_length": 355.90625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.14012219959266803, "grad_norm": 0.09901975986434068, "kl": 0.2659912109375, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 9326943.0, "reward": 5.6484375, "reward_std": 0.6417502164840698, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8359375, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 366.71875, "completions/mean_terminated_length": 366.71875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.14052953156822812, "grad_norm": 0.09331007976098271, "kl": 0.1285400390625, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 9354206.0, "reward": 5.6171875, "reward_std": 0.5458313226699829, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9296875, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 440.0625, "completions/mean_terminated_length": 440.0625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.14093686354378818, "grad_norm": 0.14630331349890233, "kl": 0.346435546875, "learning_rate": 1e-06, "loss": 0.0068, "num_tokens": 9383000.0, "reward": 4.734375, "reward_std": 1.0637853145599365, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 381.65625, "completions/mean_terminated_length": 381.65625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.14134419551934826, "grad_norm": 0.17331226749699366, "kl": 0.6561279296875, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 9410541.0, "reward": 4.4921875, "reward_std": 1.2115974426269531, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6171875, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 323.21875, "completions/mean_terminated_length": 322.2258064516129, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.14175152749490835, "grad_norm": 0.16129291860470463, "kl": 0.4150390625, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 9435276.0, "reward": 4.765625, "reward_std": 1.3257715702056885, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.703125, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 392.125, "completions/mean_terminated_length": 392.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.14215885947046844, "grad_norm": 0.24621323411601634, "kl": 0.520263671875, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 9462464.0, "reward": 5.3671875, "reward_std": 1.42042875289917, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8828125, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 400.78125, "completions/mean_terminated_length": 400.78125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.1425661914460285, "grad_norm": 0.129682090022344, "kl": 0.437744140625, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 9490625.0, "reward": 5.5859375, "reward_std": 0.9094717502593994, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.59375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8359375, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 352.46875, "completions/mean_terminated_length": 352.46875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.1429735234215886, "grad_norm": 0.09688786730480363, "kl": 0.23370361328125, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 9516376.0, "reward": 5.515625, "reward_std": 0.7535701990127563, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.953125, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 292.625, "completions/mean_terminated_length": 292.625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.14338085539714868, "grad_norm": 0.08622417964860002, "kl": 0.849609375, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 9541268.0, "reward": 5.8515625, "reward_std": 0.37146395444869995, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.5625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7890625, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 338.96875, "completions/mean_terminated_length": 338.96875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.14378818737270876, "grad_norm": 0.14260709357338208, "kl": 0.534912109375, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 9567107.0, "reward": 5.1875, "reward_std": 1.0232089757919312, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8125, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 275.34375, "completions/mean_terminated_length": 275.34375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.14419551934826885, "grad_norm": 0.10357169198308107, "kl": 0.74462890625, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 9591462.0, "reward": 5.9296875, "reward_std": 0.9630800485610962, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8671875, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 309.15625, "completions/mean_terminated_length": 309.15625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.1446028513238289, "grad_norm": 0.09464501278494115, "kl": 0.219482421875, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 9616315.0, "reward": 5.6171875, "reward_std": 0.6173626184463501, "rewards/cargo_build_reward": 1.0, "rewards/cargo_clippy_reward": 1.0, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8046875, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 376.6875, "completions/mean_terminated_length": 376.6875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.145010183299389, "grad_norm": 1.8299172629926406, "kl": 13.1044921875, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 9643193.0, "reward": 5.875, "reward_std": 1.1920604705810547, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9375, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 221.25, "completions/mean_terminated_length": 221.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.1454175152749491, "grad_norm": 0.062390222221837544, "kl": 0.460693359375, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 9664665.0, "reward": 6.1640625, "reward_std": 0.5787454843521118, "rewards/cargo_build_reward": 1.0, "rewards/cargo_clippy_reward": 1.0, "rewards/cargo_test_reward": 0.65625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8515625, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 379.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.14582484725050918, "grad_norm": 0.19469566990993933, "kl": 1.071533203125, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 9692541.0, "reward": 4.609375, "reward_std": 1.0437216758728027, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.796875, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 336.625, "completions/mean_terminated_length": 336.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.14623217922606924, "grad_norm": 0.11000479251888631, "kl": 0.231689453125, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 9718561.0, "reward": 5.0625, "reward_std": 0.8462489247322083, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9375, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 366.6875, "completions/mean_terminated_length": 366.6875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.14663951120162932, "grad_norm": 0.2937439499832218, "kl": 0.7568359375, "learning_rate": 1e-06, "loss": 0.0305, "num_tokens": 9746159.0, "reward": 4.515625, "reward_std": 1.369112253189087, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.515625, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 368.15625, "completions/mean_terminated_length": 368.15625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.1470468431771894, "grad_norm": 0.13892517468110852, "kl": 0.875244140625, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 9772972.0, "reward": 5.078125, "reward_std": 0.7367126941680908, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.734375, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 258.4375, "completions/mean_terminated_length": 258.4375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.1474541751527495, "grad_norm": 0.08422152222290788, "kl": 0.91064453125, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 9796242.0, "reward": 6.015625, "reward_std": 0.6708080172538757, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.5625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.953125, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 313.34375, "completions/mean_terminated_length": 313.34375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.14786150712830956, "grad_norm": 0.22427196336704966, "kl": 3.2813720703125, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 9822061.0, "reward": 5.6171875, "reward_std": 0.7913834452629089, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.5625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 360.5625, "completions/mean_terminated_length": 360.5625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.14826883910386965, "grad_norm": 0.15382775868441903, "kl": 0.376708984375, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 9848535.0, "reward": 5.1171875, "reward_std": 1.0124129056930542, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6796875, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 384.9375, "completions/mean_terminated_length": 397.35483870967744, "completions/min_length": 0.0, "completions/min_terminated_length": 185.0, "epoch": 0.14867617107942974, "grad_norm": 0.18905957917264707, "kl": 0.3074951171875, "learning_rate": 1e-06, "loss": -0.016, "num_tokens": 9879997.0, "reward": 5.2421875, "reward_std": 1.2076013088226318, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8671875, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 335.15625, "completions/mean_terminated_length": 335.15625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.14908350305498982, "grad_norm": 0.23612644375747585, "kl": 0.3916015625, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 9905938.0, "reward": 5.9140625, "reward_std": 0.9861429929733276, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.65625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7890625, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 317.65625, "completions/mean_terminated_length": 317.65625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.14949083503054988, "grad_norm": 0.1375577599975736, "kl": 0.35107421875, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 9930687.0, "reward": 5.6953125, "reward_std": 0.8999901413917542, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.59375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7578125, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 376.75, "completions/mean_terminated_length": 376.75, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.14989816700610997, "grad_norm": 0.12707335770193073, "kl": 0.291748046875, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 9958063.0, "reward": 4.9453125, "reward_std": 0.9925188422203064, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 409.84375, "completions/mean_terminated_length": 409.84375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.15030549898167006, "grad_norm": 334.0803513591208, "kl": 2434.5240478515625, "learning_rate": 1e-06, "loss": 0.2369, "num_tokens": 9986194.0, "reward": 5.4296875, "reward_std": 0.9298094511032104, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9296875, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.15071283095723015, "grad_norm": 0.13930721173294916, "kl": 0.2474365234375, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 10012830.0, "reward": 4.625, "reward_std": 1.1380959749221802, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9375, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.15112016293279024, "grad_norm": 0.1259032346915019, "kl": 0.7496337890625, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 10036614.0, "reward": 5.59375, "reward_std": 0.8330329060554504, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.90625, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 419.8125, "completions/mean_terminated_length": 419.8125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.1515274949083503, "grad_norm": 0.15775251764914489, "kl": 0.2373046875, "learning_rate": 1e-06, "loss": -0.0064, "num_tokens": 10065488.0, "reward": 4.453125, "reward_std": 1.140798807144165, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.640625, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 369.9375, "completions/mean_terminated_length": 369.9375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.15193482688391038, "grad_norm": 0.18144745491964562, "kl": 0.1636962890625, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 10092462.0, "reward": 5.296875, "reward_std": 1.3446197509765625, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.796875, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 351.40625, "completions/mean_terminated_length": 351.40625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.15234215885947047, "grad_norm": 0.14296036170269852, "kl": 0.565185546875, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 10118707.0, "reward": 5.5703125, "reward_std": 0.8960230350494385, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8828125, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 321.8125, "completions/mean_terminated_length": 321.8125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.15274949083503056, "grad_norm": 0.0571801373583109, "kl": 0.10498046875, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 10143893.0, "reward": 5.7109375, "reward_std": 0.4177277386188507, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8359375, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 464.15625, "completions/mean_terminated_length": 464.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.15315682281059062, "grad_norm": 0.1904269614370811, "kl": 0.350341796875, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 10173354.0, "reward": 3.7421875, "reward_std": 1.1803107261657715, "rewards/cargo_build_reward": 0.46875, "rewards/cargo_clippy_reward": 0.46875, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.7578125, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 350.9375, "completions/mean_terminated_length": 350.9375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.1535641547861507, "grad_norm": 0.16806719214370533, "kl": 0.1827392578125, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 10200480.0, "reward": 4.890625, "reward_std": 1.2776925563812256, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.765625, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 339.5, "completions/mean_terminated_length": 339.5, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.1539714867617108, "grad_norm": 0.13956120189042864, "kl": 0.251953125, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 10226192.0, "reward": 4.7265625, "reward_std": 0.7777705192565918, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 299.28125, "completions/mean_terminated_length": 308.93548387096774, "completions/min_length": 0.0, "completions/min_terminated_length": 84.0, "epoch": 0.15437881873727088, "grad_norm": 0.11156327089525053, "kl": 0.432373046875, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 10254345.0, "reward": 4.046875, "reward_std": 0.6096374988555908, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.359375, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.15478615071283094, "grad_norm": 0.1238239866670562, "kl": 0.1663818359375, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 10279469.0, "reward": 4.8671875, "reward_std": 0.8618788123130798, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6796875, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 366.9375, "completions/mean_terminated_length": 366.9375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.15519348268839103, "grad_norm": 0.1566580288929037, "kl": 0.1141357421875, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 10306251.0, "reward": 5.0546875, "reward_std": 1.1590633392333984, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8046875, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 265.15625, "completions/mean_terminated_length": 265.15625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.15560081466395112, "grad_norm": 0.09000774004267857, "kl": 0.17578125, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 10329976.0, "reward": 5.6953125, "reward_std": 0.6957299709320068, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.5625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8828125, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 313.34375, "completions/mean_terminated_length": 313.34375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.1560081466395112, "grad_norm": 0.11616919463486235, "kl": 0.1746826171875, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 10354731.0, "reward": 5.2421875, "reward_std": 0.9693155288696289, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8671875, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 275.625, "completions/mean_terminated_length": 275.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.1564154786150713, "grad_norm": 0.15165274236481607, "kl": 0.12646484375, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 10379231.0, "reward": 6.0390625, "reward_std": 1.0088050365447998, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.65625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.15682281059063136, "grad_norm": 0.15015585048817562, "kl": 0.1282958984375, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 10401717.0, "reward": 4.890625, "reward_std": 1.097530722618103, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.828125, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 248.375, "completions/mean_terminated_length": 248.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.15723014256619144, "grad_norm": 0.09102946894059866, "kl": 0.208740234375, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 10423929.0, "reward": 5.921875, "reward_std": 0.9516997933387756, "rewards/cargo_build_reward": 1.0, "rewards/cargo_clippy_reward": 1.0, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 285.34375, "completions/mean_terminated_length": 285.34375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.15763747454175153, "grad_norm": 0.10176893664604732, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 10448044.0, "reward": 5.53125, "reward_std": 0.8825888633728027, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.96875, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 358.09375, "completions/mean_terminated_length": 358.09375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.15804480651731162, "grad_norm": 0.09486203025489602, "kl": 0.2197265625, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 10475279.0, "reward": 5.3515625, "reward_std": 0.7889389395713806, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7265625, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 285.71875, "completions/mean_terminated_length": 285.71875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.15845213849287168, "grad_norm": 0.11315800904320061, "kl": 0.1761474609375, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 10499462.0, "reward": 5.875, "reward_std": 0.8068819046020508, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.65625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.75, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 310.53125, "completions/mean_terminated_length": 310.53125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.15885947046843177, "grad_norm": 0.116427112693145, "kl": 0.4306640625, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 10524895.0, "reward": 5.0390625, "reward_std": 0.9798887968063354, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8828125, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 395.21875, "completions/mean_terminated_length": 395.21875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.15926680244399186, "grad_norm": 0.10921671461750218, "kl": 0.26708984375, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 10552454.0, "reward": 5.21875, "reward_std": 0.7757423520088196, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.96875, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 361.9375, "completions/mean_terminated_length": 361.9375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.15967413441955194, "grad_norm": 0.11276544935702329, "kl": 0.4576416015625, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 10579828.0, "reward": 5.2265625, "reward_std": 0.9451630115509033, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8515625, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 350.40625, "completions/mean_terminated_length": 350.40625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.160081466395112, "grad_norm": 0.11822273106327352, "kl": 0.5804443359375, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 10605409.0, "reward": 4.6953125, "reward_std": 0.9203661680221558, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9453125, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3345.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 357.40625, "completions/mean_terminated_length": 261.03225806451616, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1604887983706721, "grad_norm": 0.10441250245049562, "kl": 0.18048095703125, "learning_rate": 1e-06, "loss": 0.1009, "num_tokens": 10632230.0, "reward": 6.1875, "reward_std": 0.6983473300933838, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.75, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8125, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 237.8125, "completions/mean_terminated_length": 237.8125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.16089613034623218, "grad_norm": 0.07926754950109575, "kl": 0.9730224609375, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 10654432.0, "reward": 6.625, "reward_std": 0.3837546706199646, "rewards/cargo_build_reward": 1.0, "rewards/cargo_clippy_reward": 1.0, "rewards/cargo_test_reward": 0.9375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.75, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2061.0, "completions/max_terminated_length": 2061.0, "completions/mean_length": 370.84375, "completions/mean_terminated_length": 370.84375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.16130346232179227, "grad_norm": 0.1073240116584982, "kl": 0.2197265625, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 10680451.0, "reward": 5.484375, "reward_std": 0.9157319068908691, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 383.375, "completions/mean_terminated_length": 383.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.16171079429735236, "grad_norm": 0.09673030237234143, "kl": 0.6353759765625, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 10707399.0, "reward": 5.1328125, "reward_std": 0.788161039352417, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8828125, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 363.625, "completions/mean_terminated_length": 363.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.16211812627291242, "grad_norm": 0.10485223516833546, "kl": 1.342529296875, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 10734283.0, "reward": 5.046875, "reward_std": 0.567676842212677, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 267.96875, "completions/mean_terminated_length": 267.96875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.1625254582484725, "grad_norm": 0.07787048703564636, "kl": 1.0731201171875, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 10757706.0, "reward": 5.4609375, "reward_std": 0.2630031108856201, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7109375, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 474.03125, "completions/mean_terminated_length": 474.03125, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.1629327902240326, "grad_norm": 0.20822367287756927, "kl": 1.23486328125, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 10787763.0, "reward": 4.6328125, "reward_std": 1.002000093460083, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6328125, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 393.78125, "completions/mean_terminated_length": 393.78125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.16334012219959268, "grad_norm": 0.20937612914834222, "kl": 0.8896484375, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 10814844.0, "reward": 5.0546875, "reward_std": 0.8593862056732178, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9609375, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 352.5, "completions/mean_terminated_length": 352.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.16374745417515274, "grad_norm": 0.1809280665979603, "kl": 0.12677001953125, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 10841004.0, "reward": 5.7109375, "reward_std": 1.421525001525879, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8984375, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 301.5625, "completions/mean_terminated_length": 301.5625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.16415478615071283, "grad_norm": 0.09698607024115674, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 10865790.0, "reward": 5.515625, "reward_std": 0.7098821401596069, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.578125, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 371.71875, "completions/mean_terminated_length": 371.71875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.16456211812627292, "grad_norm": 0.18732994989651297, "kl": 0.161865234375, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 10892269.0, "reward": 4.21875, "reward_std": 1.2697362899780273, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.625, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 350.3125, "completions/mean_terminated_length": 350.3125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.164969450101833, "grad_norm": 0.13301919129004566, "kl": 0.1346435546875, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 10918631.0, "reward": 5.3515625, "reward_std": 0.9757429361343384, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 313.75, "completions/mean_terminated_length": 313.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.16537678207739306, "grad_norm": 0.10968180274000976, "kl": 0.10076904296875, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 10943543.0, "reward": 5.0078125, "reward_std": 0.783983588218689, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9453125, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 303.9375, "completions/mean_terminated_length": 303.9375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.16578411405295315, "grad_norm": 0.08829224918003574, "kl": 0.0755615234375, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 10968541.0, "reward": 5.359375, "reward_std": 0.68646639585495, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 417.90625, "completions/mean_terminated_length": 417.90625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.16619144602851324, "grad_norm": 0.15879792392845696, "kl": 0.1019287109375, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 10996930.0, "reward": 4.5625, "reward_std": 0.992995023727417, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.875, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 350.5, "completions/mean_terminated_length": 350.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.16659877800407333, "grad_norm": 0.1618737948435576, "kl": 0.06365966796875, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 11022642.0, "reward": 5.1171875, "reward_std": 1.2929661273956299, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 370.21875, "completions/mean_terminated_length": 370.21875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.1670061099796334, "grad_norm": 0.1469536219914811, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 11049953.0, "reward": 5.5, "reward_std": 0.8628734350204468, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.84375, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 344.1875, "completions/mean_terminated_length": 355.2903225806452, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.16741344195519348, "grad_norm": 0.28361110669123324, "kl": 0.15478515625, "learning_rate": 1e-06, "loss": -0.0111, "num_tokens": 11079064.0, "reward": 4.9296875, "reward_std": 1.547074556350708, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 0.96875, "rewards/non_empty_reward": 0.96875, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8671875, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 337.9375, "completions/mean_terminated_length": 337.9375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.16782077393075356, "grad_norm": 0.15166421938927865, "kl": 0.1522216796875, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 11104446.0, "reward": 5.8046875, "reward_std": 0.7720569372177124, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 329.96875, "completions/mean_terminated_length": 329.96875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.16822810590631365, "grad_norm": 0.06293129841573249, "kl": 0.1748046875, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 11129573.0, "reward": 4.609375, "reward_std": 0.5376979112625122, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.546875, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 356.625, "completions/mean_terminated_length": 356.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.16863543788187374, "grad_norm": 0.22461425052244266, "kl": 0.1195068359375, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 11156137.0, "reward": 4.640625, "reward_std": 1.0310025215148926, "rewards/cargo_build_reward": 0.5625, "rewards/cargo_clippy_reward": 0.5625, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.828125, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 273.875, "completions/mean_terminated_length": 273.875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.1690427698574338, "grad_norm": 0.08429891831942984, "kl": 0.1258544921875, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 11180173.0, "reward": 6.421875, "reward_std": 0.5966733694076538, "rewards/cargo_build_reward": 1.0, "rewards/cargo_clippy_reward": 1.0, "rewards/cargo_test_reward": 0.75, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 276.09375, "completions/mean_terminated_length": 276.09375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.1694501018329939, "grad_norm": 0.1456370513489768, "kl": 0.41259765625, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 11203464.0, "reward": 6.0390625, "reward_std": 0.7834969758987427, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.6875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8515625, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 442.8125, "completions/mean_terminated_length": 442.8125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.16985743380855398, "grad_norm": 0.18604758304984106, "kl": 0.11004638671875, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 11232954.0, "reward": 5.1953125, "reward_std": 1.1531951427459717, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 287.53125, "completions/mean_terminated_length": 287.53125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.17026476578411406, "grad_norm": 0.12467595783642056, "kl": 0.138916015625, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 11256659.0, "reward": 5.9453125, "reward_std": 1.1238775253295898, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.59375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8203125, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 323.21875, "completions/mean_terminated_length": 323.21875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.17067209775967412, "grad_norm": 0.11434602722609279, "kl": 0.607177734375, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 11282434.0, "reward": 4.59375, "reward_std": 0.6790916323661804, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.59375, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 446.6875, "completions/mean_terminated_length": 446.6875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.1710794297352342, "grad_norm": 0.19793586572081812, "kl": 1.4931640625, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 11312552.0, "reward": 4.3359375, "reward_std": 1.09117591381073, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 380.65625, "completions/mean_terminated_length": 380.65625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.1714867617107943, "grad_norm": 0.10793400514470229, "kl": 0.2606201171875, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 11339461.0, "reward": 5.28125, "reward_std": 0.6398772597312927, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.84375, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 297.5625, "completions/mean_terminated_length": 297.5625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.1718940936863544, "grad_norm": 0.19091039429941775, "kl": 0.3817138671875, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 11363895.0, "reward": 5.6796875, "reward_std": 1.2723278999328613, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9921875, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 344.625, "completions/mean_terminated_length": 344.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.17230142566191445, "grad_norm": 0.11652509405044195, "kl": 0.5419921875, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 11390211.0, "reward": 5.921875, "reward_std": 0.805986225605011, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 352.03125, "completions/mean_terminated_length": 352.03125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.17270875763747454, "grad_norm": 0.0909377094601466, "kl": 0.56005859375, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 11416860.0, "reward": 5.5703125, "reward_std": 0.6062199473381042, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8828125, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 478.84375, "completions/mean_terminated_length": 478.84375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.17311608961303462, "grad_norm": 0.21006478717124752, "kl": 0.494140625, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 11448007.0, "reward": 4.59375, "reward_std": 1.4197256565093994, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.90625, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 328.3125, "completions/mean_terminated_length": 328.3125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1735234215885947, "grad_norm": 0.17995730877645025, "kl": 0.34423828125, "learning_rate": 1e-06, "loss": 0.0238, "num_tokens": 11474065.0, "reward": 6.015625, "reward_std": 0.8857485055923462, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.6875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.828125, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 393.0625, "completions/mean_terminated_length": 393.0625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.1739307535641548, "grad_norm": 0.17681931663172054, "kl": 0.390625, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 11502435.0, "reward": 4.953125, "reward_std": 1.093381404876709, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.21875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.828125, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 362.84375, "completions/mean_terminated_length": 362.84375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.17433808553971486, "grad_norm": 0.19182910026101962, "kl": 1.423828125, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 11529318.0, "reward": 5.234375, "reward_std": 1.2473208904266357, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.875, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 478.375, "completions/mean_terminated_length": 478.375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.17474541751527495, "grad_norm": 0.10548379457160806, "kl": 0.30322265625, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 11558874.0, "reward": 4.453125, "reward_std": 0.6049275398254395, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.0625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 349.75, "completions/mean_terminated_length": 349.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.17515274949083504, "grad_norm": 0.1807422808173608, "kl": 0.1004638671875, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 11585274.0, "reward": 5.421875, "reward_std": 1.433083415031433, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.890625, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 427.15625, "completions/mean_terminated_length": 427.15625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.17556008146639512, "grad_norm": 0.1610145168588778, "kl": 0.323974609375, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 11613639.0, "reward": 5.03125, "reward_std": 1.1478092670440674, "rewards/cargo_build_reward": 0.65625, "rewards/cargo_clippy_reward": 0.65625, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.78125, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 370.375, "completions/mean_terminated_length": 370.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.17596741344195518, "grad_norm": 0.1301613036980156, "kl": 0.260009765625, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 11639987.0, "reward": 5.2890625, "reward_std": 0.7868574857711792, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 286.46875, "completions/mean_terminated_length": 286.46875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.17637474541751527, "grad_norm": 0.10529105951652248, "kl": 0.5634765625, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 11663602.0, "reward": 5.8984375, "reward_std": 0.6449092030525208, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.5625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8984375, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 374.8125, "completions/mean_terminated_length": 374.8125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.17678207739307536, "grad_norm": 0.2267686821810141, "kl": 0.27734375, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 11690908.0, "reward": 4.703125, "reward_std": 1.112496256828308, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.765625, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 361.375, "completions/mean_terminated_length": 361.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.17718940936863545, "grad_norm": 0.15644434744266977, "kl": 0.305419921875, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 11717744.0, "reward": 5.2265625, "reward_std": 0.9309062957763672, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6640625, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 517.21875, "completions/mean_terminated_length": 517.21875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1775967413441955, "grad_norm": 0.21620849209453313, "kl": 0.273681640625, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 11748863.0, "reward": 4.9375, "reward_std": 1.206173300743103, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.875, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 286.5, "completions/mean_terminated_length": 286.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.1780040733197556, "grad_norm": 0.11184396135570342, "kl": 0.29833984375, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 11772679.0, "reward": 5.0625, "reward_std": 0.7270735502243042, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.875, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 327.34375, "completions/mean_terminated_length": 327.34375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.17841140529531568, "grad_norm": 0.21751636937732396, "kl": 1.451171875, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 11798578.0, "reward": 5.84375, "reward_std": 1.2837473154067993, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.65625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.78125, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 405.09375, "completions/mean_terminated_length": 405.09375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.17881873727087577, "grad_norm": 0.19611327561773098, "kl": 0.892333984375, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 11826781.0, "reward": 4.59375, "reward_std": 1.0302538871765137, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.84375, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 356.3125, "completions/mean_terminated_length": 356.3125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.17922606924643583, "grad_norm": 0.15175189330360042, "kl": 0.30322265625, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 11853647.0, "reward": 5.71875, "reward_std": 0.9608312845230103, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.59375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8125, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 330.5, "completions/mean_terminated_length": 330.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.17963340122199592, "grad_norm": 0.23551920970756896, "kl": 0.458251953125, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 11879143.0, "reward": 5.34375, "reward_std": 1.275673508644104, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.5625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.53125, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 341.96875, "completions/mean_terminated_length": 341.96875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.180040733197556, "grad_norm": 0.19668837839183034, "kl": 0.85009765625, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 11904430.0, "reward": 5.828125, "reward_std": 1.0695008039474487, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.953125, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.1804480651731161, "grad_norm": 0.15123640668333174, "kl": 0.57861328125, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 11927770.0, "reward": 5.7109375, "reward_std": 0.8943933844566345, "rewards/cargo_build_reward": 1.0, "rewards/cargo_clippy_reward": 1.0, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9609375, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 355.1875, "completions/mean_terminated_length": 354.8709677419355, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.18085539714867618, "grad_norm": 0.3399968048336642, "kl": 3.849609375, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 11954120.0, "reward": 5.6171875, "reward_std": 0.9756202101707458, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9296875, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 363.71875, "completions/mean_terminated_length": 363.71875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.18126272912423624, "grad_norm": 0.0856631271266159, "kl": 0.323486328125, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 11981247.0, "reward": 4.9140625, "reward_std": 0.4155312776565552, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.03125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 303.09375, "completions/mean_terminated_length": 303.09375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.18167006109979633, "grad_norm": 0.5581568917702728, "kl": 6.1220703125, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 12005490.0, "reward": 5.9765625, "reward_std": 0.9034623503684998, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7890625, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 400.25, "completions/mean_terminated_length": 400.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.18207739307535642, "grad_norm": 0.16536634201006845, "kl": 0.5810546875, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 12033810.0, "reward": 5.2421875, "reward_std": 0.9611786603927612, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6796875, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 269.21875, "completions/mean_terminated_length": 269.21875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.1824847250509165, "grad_norm": 0.12243037217908188, "kl": 0.651611328125, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 12057809.0, "reward": 6.421875, "reward_std": 0.992490291595459, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.84375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 420.0, "completions/mean_terminated_length": 420.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.18289205702647657, "grad_norm": 0.1586099231207866, "kl": 1.142578125, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 12085865.0, "reward": 4.5859375, "reward_std": 0.9468885064125061, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 361.375, "completions/mean_terminated_length": 361.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.18329938900203666, "grad_norm": 0.19762038686403974, "kl": 0.309326171875, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 12112197.0, "reward": 5.2890625, "reward_std": 1.0431478023529053, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.78125, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.9296875, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 398.46875, "completions/mean_terminated_length": 398.46875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.18370672097759674, "grad_norm": 0.11791037915643654, "kl": 0.3271484375, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 12139140.0, "reward": 5.453125, "reward_std": 0.7159276008605957, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.953125, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 387.59375, "completions/mean_terminated_length": 387.59375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.18411405295315683, "grad_norm": 0.19748641854512347, "kl": 1.38232421875, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 12166903.0, "reward": 4.7109375, "reward_std": 0.9857768416404724, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8984375, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 370.8125, "completions/mean_terminated_length": 395.53333333333336, "completions/min_length": 0.0, "completions/min_terminated_length": 242.0, "epoch": 0.1845213849287169, "grad_norm": 0.26793835232040064, "kl": 1.05419921875, "learning_rate": 1e-06, "loss": -0.0252, "num_tokens": 12201155.0, "reward": 4.921875, "reward_std": 1.447327971458435, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 0.9375, "rewards/non_empty_reward": 0.9375, "rewards/test_block_count_reward": 0.9375, "rewards/tests_have_asserts_reward": 0.921875, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 258.46875, "completions/mean_terminated_length": 258.46875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.18492871690427698, "grad_norm": 0.11951130654168295, "kl": 0.2705078125, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 12224282.0, "reward": 5.2109375, "reward_std": 0.8302067518234253, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6484375, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 350.15625, "completions/mean_terminated_length": 350.15625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.18533604887983707, "grad_norm": 0.22151210736583518, "kl": 1.805908203125, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 12250799.0, "reward": 5.8125, "reward_std": 0.5101194381713867, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.65625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.65625, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 415.0, "completions/mean_terminated_length": 415.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.18574338085539716, "grad_norm": 0.19235112400930446, "kl": 0.5244140625, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 12279199.0, "reward": 5.2109375, "reward_std": 1.1287403106689453, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8984375, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 324.96875, "completions/mean_terminated_length": 324.96875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.18615071283095724, "grad_norm": 0.2618561387504406, "kl": 2.25390625, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 12304854.0, "reward": 5.796875, "reward_std": 0.9935402870178223, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 327.0, "completions/mean_terminated_length": 327.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1865580448065173, "grad_norm": 0.14057633470756206, "kl": 1.26025390625, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 12330454.0, "reward": 5.609375, "reward_std": 0.8704342246055603, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.796875, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 380.0, "completions/mean_terminated_length": 380.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.1869653767820774, "grad_norm": 0.14731921461654593, "kl": 0.5023193359375, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 12358278.0, "reward": 4.8984375, "reward_std": 0.999342679977417, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8984375, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 408.28125, "completions/mean_terminated_length": 408.28125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.18737270875763748, "grad_norm": 0.15995704734078894, "kl": 0.92578125, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 12386247.0, "reward": 4.609375, "reward_std": 1.1562089920043945, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.1875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 356.5625, "completions/mean_terminated_length": 356.5625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.18778004073319757, "grad_norm": 0.20457059650057555, "kl": 0.61572265625, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 12412649.0, "reward": 5.5625, "reward_std": 0.7501891851425171, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9375, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1093.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 396.40625, "completions/mean_terminated_length": 396.40625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.18818737270875763, "grad_norm": 0.1654841583796105, "kl": 0.4827880859375, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 12441142.0, "reward": 4.703125, "reward_std": 1.057011604309082, "rewards/cargo_build_reward": 0.78125, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.09375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.984375, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 528.75, "completions/mean_terminated_length": 528.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.18859470468431772, "grad_norm": 0.33716659828305967, "kl": 0.581298828125, "learning_rate": 1e-06, "loss": 0.022, "num_tokens": 12473190.0, "reward": 5.0234375, "reward_std": 1.5583802461624146, "rewards/cargo_build_reward": 0.6875, "rewards/cargo_clippy_reward": 0.6875, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8359375, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 303.375, "completions/mean_terminated_length": 303.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1890020366598778, "grad_norm": 0.14091463041705182, "kl": 0.2255859375, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 12497322.0, "reward": 5.2734375, "reward_std": 1.10199773311615, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8984375, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 451.03125, "completions/mean_terminated_length": 451.03125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.1894093686354379, "grad_norm": 0.12767800369089213, "kl": 0.304931640625, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 12527443.0, "reward": 5.03125, "reward_std": 0.7347978353500366, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.65625, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 325.25, "completions/mean_terminated_length": 325.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.18981670061099795, "grad_norm": 0.05474815313000358, "kl": 0.1480712890625, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 12553059.0, "reward": 6.171875, "reward_std": 0.4042172133922577, "rewards/cargo_build_reward": 1.0, "rewards/cargo_clippy_reward": 1.0, "rewards/cargo_test_reward": 0.625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 281.125, "completions/mean_terminated_length": 281.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.19022403258655804, "grad_norm": 0.11779257666699224, "kl": 0.45068359375, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 12576679.0, "reward": 5.1171875, "reward_std": 0.5848700404167175, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7421875, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 326.6875, "completions/mean_terminated_length": 326.6875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.19063136456211813, "grad_norm": 0.1447683885773122, "kl": 0.330810546875, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 12602325.0, "reward": 5.078125, "reward_std": 0.9557596445083618, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.3125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.828125, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 369.3125, "completions/mean_terminated_length": 369.3125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.19103869653767822, "grad_norm": 0.1527553488628639, "kl": 0.35943603515625, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 12629103.0, "reward": 5.2578125, "reward_std": 0.8825398087501526, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9453125, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.19144602851323828, "grad_norm": 0.1369293138745396, "kl": 0.5859375, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 12654891.0, "reward": 5.9375, "reward_std": 0.761785089969635, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.5625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 1.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 476.09375, "completions/mean_terminated_length": 476.09375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.19185336048879836, "grad_norm": 0.16020188437510458, "kl": 0.2183837890625, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 12685414.0, "reward": 5.046875, "reward_std": 1.0090612173080444, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.25, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.19226069246435845, "grad_norm": 0.18716380062978372, "kl": 0.240966796875, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 12711390.0, "reward": 5.5625, "reward_std": 1.2912083864212036, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.46875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.875, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 316.5625, "completions/mean_terminated_length": 316.5625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.19266802443991854, "grad_norm": 0.12726416523937797, "kl": 0.510986328125, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 12736760.0, "reward": 5.75, "reward_std": 0.4646196663379669, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9375, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 322.0625, "completions/mean_terminated_length": 322.0625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.19307535641547863, "grad_norm": 0.19139405503932053, "kl": 0.6455078125, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 12762074.0, "reward": 5.3671875, "reward_std": 1.0574750900268555, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8359375, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 413.1875, "completions/mean_terminated_length": 413.1875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.1934826883910387, "grad_norm": 0.20173974979643558, "kl": 0.47998046875, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 12790776.0, "reward": 4.328125, "reward_std": 1.2768642902374268, "rewards/cargo_build_reward": 0.59375, "rewards/cargo_clippy_reward": 0.59375, "rewards/cargo_test_reward": 0.125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.890625, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 471.15625, "completions/mean_terminated_length": 471.15625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.19389002036659878, "grad_norm": 0.19938856692223425, "kl": 0.365478515625, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 12821157.0, "reward": 4.6640625, "reward_std": 1.1157841682434082, "rewards/cargo_build_reward": 0.71875, "rewards/cargo_clippy_reward": 0.71875, "rewards/cargo_test_reward": 0.15625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 302.9375, "completions/mean_terminated_length": 302.9375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.19429735234215886, "grad_norm": 0.21634659107954318, "kl": 0.4873046875, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 12846059.0, "reward": 6.2109375, "reward_std": 0.763170599937439, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.8125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7734375, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 296.46875, "completions/mean_terminated_length": 295.93548387096774, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.19470468431771895, "grad_norm": 0.15580036522512727, "kl": 1.927978515625, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 12870202.0, "reward": 5.8203125, "reward_std": 1.0648064613342285, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9453125, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 335.0625, "completions/mean_terminated_length": 335.0625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.195112016293279, "grad_norm": 0.14490128955032502, "kl": 0.397216796875, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 12895668.0, "reward": 5.625, "reward_std": 0.9494472742080688, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.9375, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.875, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 481.28125, "completions/mean_terminated_length": 481.28125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.1955193482688391, "grad_norm": 0.17211256383646323, "kl": 1.50634765625, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 12926229.0, "reward": 5.1640625, "reward_std": 0.5368491411209106, "rewards/cargo_build_reward": 0.625, "rewards/cargo_clippy_reward": 0.625, "rewards/cargo_test_reward": 0.5, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 445.125, "completions/mean_terminated_length": 445.125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.1959266802443992, "grad_norm": 1.266372236336311, "kl": 6.962890625, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 12956073.0, "reward": 5.34375, "reward_std": 1.0504249334335327, "rewards/cargo_build_reward": 0.9375, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.75, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 278.125, "completions/mean_terminated_length": 278.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.19633401221995928, "grad_norm": 0.2264456989202532, "kl": 1.4228515625, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 12980477.0, "reward": 5.265625, "reward_std": 1.2569725513458252, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.828125, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 289.28125, "completions/mean_terminated_length": 289.28125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.19674134419551934, "grad_norm": 0.1574338951932043, "kl": 0.398681640625, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 13004566.0, "reward": 6.3125, "reward_std": 0.7749233841896057, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.71875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9375, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 391.21875, "completions/mean_terminated_length": 391.21875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.19714867617107942, "grad_norm": 0.22265607253657393, "kl": 1.1279296875, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 13032013.0, "reward": 5.5234375, "reward_std": 1.4083189964294434, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.7109375, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 329.53125, "completions/mean_terminated_length": 329.53125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1975560081466395, "grad_norm": 0.16963911268826184, "kl": 0.841796875, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 13058102.0, "reward": 5.75, "reward_std": 1.0953752994537354, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.875, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 364.8125, "completions/mean_terminated_length": 364.8125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.1979633401221996, "grad_norm": 0.1374298967851221, "kl": 0.864501953125, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 13085072.0, "reward": 5.421875, "reward_std": 0.736834704875946, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.921875, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 298.21875, "completions/mean_terminated_length": 298.21875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.1983706720977597, "grad_norm": 0.1261195087623572, "kl": 0.7607421875, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 13109911.0, "reward": 5.546875, "reward_std": 0.8305887579917908, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 310.96875, "completions/mean_terminated_length": 310.96875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.19877800407331975, "grad_norm": 0.20554491217342194, "kl": 0.95458984375, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 13134190.0, "reward": 5.5390625, "reward_std": 0.9816341996192932, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.4375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9140625, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 266.375, "completions/mean_terminated_length": 266.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.19918533604887984, "grad_norm": 0.13393542281782445, "kl": 0.2066650390625, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 13157778.0, "reward": 6.2578125, "reward_std": 0.9760429263114929, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.75, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9453125, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 287.0625, "completions/mean_terminated_length": 287.0625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.19959266802443992, "grad_norm": 0.14381180965090343, "kl": 0.569091796875, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 13182436.0, "reward": 6.1875, "reward_std": 0.6123279333114624, "rewards/cargo_build_reward": 0.90625, "rewards/cargo_clippy_reward": 0.90625, "rewards/cargo_test_reward": 0.6875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 1.0, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 372.40625, "completions/mean_terminated_length": 372.40625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.2, "grad_norm": 0.20813935210236095, "kl": 0.9661865234375, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 13209961.0, "reward": 5.609375, "reward_std": 1.2343088388442993, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 277.53125, "completions/mean_terminated_length": 277.53125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.20040733197556007, "grad_norm": 0.18185843577224053, "kl": 1.46142578125, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 13233450.0, "reward": 6.0859375, "reward_std": 0.696807861328125, "rewards/cargo_build_reward": 1.0, "rewards/cargo_clippy_reward": 1.0, "rewards/cargo_test_reward": 0.59375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8984375, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 360.34375, "completions/mean_terminated_length": 360.34375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.20081466395112016, "grad_norm": 0.18184364860984456, "kl": 2.14453125, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 13260229.0, "reward": 6.1171875, "reward_std": 0.9113576412200928, "rewards/cargo_build_reward": 0.96875, "rewards/cargo_clippy_reward": 0.96875, "rewards/cargo_test_reward": 0.6875, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.8046875, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 430.21875, "completions/mean_terminated_length": 430.21875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.20122199592668025, "grad_norm": 0.3999569580272008, "kl": 3.3974609375, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 13289452.0, "reward": 5.421875, "reward_std": 1.0172343254089355, "rewards/cargo_build_reward": 0.84375, "rewards/cargo_clippy_reward": 0.84375, "rewards/cargo_test_reward": 0.40625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.9375, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 318.15625, "completions/mean_terminated_length": 318.15625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.20162932790224034, "grad_norm": 0.18334239173462724, "kl": 0.400146484375, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 13314457.0, "reward": 6.0, "reward_std": 1.0162653923034668, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 1.0, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 319.96875, "completions/mean_terminated_length": 319.96875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.2020366598778004, "grad_norm": 0.45914975101690647, "kl": 3.718505859375, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 13339552.0, "reward": 5.1484375, "reward_std": 1.227099895477295, "rewards/cargo_build_reward": 0.8125, "rewards/cargo_clippy_reward": 0.8125, "rewards/cargo_test_reward": 0.34375, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 0.96875, "rewards/tests_have_asserts_reward": 0.8515625, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 265.78125, "completions/mean_terminated_length": 265.78125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.20244399185336048, "grad_norm": 0.1617705252962688, "kl": 2.36474609375, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 13362441.0, "reward": 6.2109375, "reward_std": 0.4671742618083954, "rewards/cargo_build_reward": 1.0, "rewards/cargo_clippy_reward": 1.0, "rewards/cargo_test_reward": 0.625, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.9609375, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 211.625, "completions/mean_terminated_length": 211.625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.20285132382892057, "grad_norm": 0.05636492303275045, "kl": 0.39111328125, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 13382925.0, "reward": 6.2578125, "reward_std": 0.4500587284564972, "rewards/cargo_build_reward": 1.0, "rewards/cargo_clippy_reward": 1.0, "rewards/cargo_test_reward": 0.78125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.6953125, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 371.53125, "completions/mean_terminated_length": 371.53125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.20325865580448066, "grad_norm": 0.2578517900980799, "kl": 1.40234375, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 13409670.0, "reward": 4.921875, "reward_std": 1.2841131687164307, "rewards/cargo_build_reward": 0.75, "rewards/cargo_clippy_reward": 0.75, "rewards/cargo_test_reward": 0.28125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.859375, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 366.71875, "completions/mean_terminated_length": 366.71875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.20366598778004075, "grad_norm": 0.22751045945985918, "kl": 0.599609375, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 13435773.0, "reward": 5.546875, "reward_std": 1.4063599109649658, "rewards/cargo_build_reward": 0.875, "rewards/cargo_clippy_reward": 0.875, "rewards/cargo_test_reward": 0.53125, "rewards/code_block_count_reward": 1.0, "rewards/non_empty_reward": 1.0, "rewards/test_block_count_reward": 1.0, "rewards/tests_have_asserts_reward": 0.734375, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 13435773, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }