{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 255,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 378.0,
      "completions/max_terminated_length": 378.0,
      "completions/mean_length": 116.875,
      "completions/mean_terminated_length": 116.875,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.3960496634244919,
      "epoch": 0.011764705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3658151626586914,
      "learning_rate": 1e-06,
      "loss": 0.0,
      "num_tokens": 120704.0,
      "reward": 0.42291906476020813,
      "reward_std": 0.353160560131073,
      "rewards/rna_reward_fn/mean": 0.42291906476020813,
      "rewards/rna_reward_fn/std": 0.39480823278427124,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 145.34375,
      "completions/mean_terminated_length": 145.34375,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "entropy": 0.3918581157922745,
      "epoch": 0.023529411764705882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3561055362224579,
      "learning_rate": 9.96078431372549e-07,
      "loss": 0.0,
      "num_tokens": 270560.0,
      "reward": 0.4679465889930725,
      "reward_std": 0.304127037525177,
      "rewards/rna_reward_fn/mean": 0.4679465889930725,
      "rewards/rna_reward_fn/std": 0.37357842922210693,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 169.4375,
      "completions/mean_terminated_length": 169.4375,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "entropy": 0.3528731167316437,
      "epoch": 0.03529411764705882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3573973476886749,
      "learning_rate": 9.92156862745098e-07,
      "loss": 0.0,
      "num_tokens": 445088.0,
      "reward": 0.4688035249710083,
      "reward_std": 0.3215726613998413,
      "rewards/rna_reward_fn/mean": 0.4688035249710083,
      "rewards/rna_reward_fn/std": 0.3945569097995758,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 164.53125,
      "completions/mean_terminated_length": 164.53125,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.3565346747636795,
      "epoch": 0.047058823529411764,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.37075310945510864,
      "learning_rate": 9.88235294117647e-07,
      "loss": -0.0,
      "num_tokens": 614592.0,
      "reward": 0.5333437323570251,
      "reward_std": 0.3202625513076782,
      "rewards/rna_reward_fn/mean": 0.5333437323570251,
      "rewards/rna_reward_fn/std": 0.3746815025806427,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 400.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 103.3125,
      "completions/mean_terminated_length": 103.3125,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "entropy": 0.35146908462047577,
      "epoch": 0.058823529411764705,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.34449008107185364,
      "learning_rate": 9.84313725490196e-07,
      "loss": -0.0,
      "num_tokens": 721408.0,
      "reward": 0.5266900062561035,
      "reward_std": 0.32159364223480225,
      "rewards/rna_reward_fn/mean": 0.5266900062561035,
      "rewards/rna_reward_fn/std": 0.3701845705509186,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 161.25,
      "completions/mean_terminated_length": 161.25,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "entropy": 0.3309106081724167,
      "epoch": 0.07058823529411765,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.35763484239578247,
      "learning_rate": 9.80392156862745e-07,
      "loss": -0.0,
      "num_tokens": 887552.0,
      "reward": 0.5357265472412109,
      "reward_std": 0.2797412872314453,
      "rewards/rna_reward_fn/mean": 0.5357265472412109,
      "rewards/rna_reward_fn/std": 0.3577335476875305,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 151.375,
      "completions/mean_terminated_length": 151.375,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "entropy": 0.34717176854610443,
      "epoch": 0.08235294117647059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3663802146911621,
      "learning_rate": 9.76470588235294e-07,
      "loss": -0.0,
      "num_tokens": 1043584.0,
      "reward": 0.547458291053772,
      "reward_std": 0.2995288372039795,
      "rewards/rna_reward_fn/mean": 0.547458291053772,
      "rewards/rna_reward_fn/std": 0.3604092001914978,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 482.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 167.125,
      "completions/mean_terminated_length": 167.125,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.31340789794921875,
      "epoch": 0.09411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4071066081523895,
      "learning_rate": 9.725490196078432e-07,
      "loss": -0.0,
      "num_tokens": 1215744.0,
      "reward": 0.5176310539245605,
      "reward_std": 0.3205966353416443,
      "rewards/rna_reward_fn/mean": 0.5176310539245605,
      "rewards/rna_reward_fn/std": 0.3642078638076782,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 151.59375,
      "completions/mean_terminated_length": 151.59375,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "entropy": 0.305365189909935,
      "epoch": 0.10588235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3989139795303345,
      "learning_rate": 9.686274509803921e-07,
      "loss": -0.0,
      "num_tokens": 1372000.0,
      "reward": 0.6008568406105042,
      "reward_std": 0.30818045139312744,
      "rewards/rna_reward_fn/mean": 0.6008569002151489,
      "rewards/rna_reward_fn/std": 0.35290631651878357,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 135.53125,
      "completions/mean_terminated_length": 135.53125,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "entropy": 0.2962174266576767,
      "epoch": 0.11764705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.44642144441604614,
      "learning_rate": 9.64705882352941e-07,
      "loss": 0.0,
      "num_tokens": 1511808.0,
      "reward": 0.540717601776123,
      "reward_std": 0.3060719966888428,
      "rewards/rna_reward_fn/mean": 0.540717601776123,
      "rewards/rna_reward_fn/std": 0.36574023962020874,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 187.71875,
      "completions/mean_terminated_length": 187.71875,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.2934599667787552,
      "epoch": 0.12941176470588237,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3814420700073242,
      "learning_rate": 9.607843137254902e-07,
      "loss": -0.0,
      "num_tokens": 1705056.0,
      "reward": 0.6084277629852295,
      "reward_std": 0.3016743063926697,
      "rewards/rna_reward_fn/mean": 0.6084277629852295,
      "rewards/rna_reward_fn/std": 0.37008586525917053,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 123.65625,
      "completions/mean_terminated_length": 123.65625,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.28613443672657013,
      "epoch": 0.1411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.34958702325820923,
      "learning_rate": 9.568627450980392e-07,
      "loss": 0.0,
      "num_tokens": 1832704.0,
      "reward": 0.6017879247665405,
      "reward_std": 0.3006741404533386,
      "rewards/rna_reward_fn/mean": 0.6017879247665405,
      "rewards/rna_reward_fn/std": 0.35490649938583374,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 140.65625,
      "completions/mean_terminated_length": 140.65625,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "entropy": 0.277506560087204,
      "epoch": 0.15294117647058825,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5352854132652283,
      "learning_rate": 9.529411764705881e-07,
      "loss": 0.0,
      "num_tokens": 1977760.0,
      "reward": 0.571915328502655,
      "reward_std": 0.2985040843486786,
      "rewards/rna_reward_fn/mean": 0.5719153881072998,
      "rewards/rna_reward_fn/std": 0.3767135441303253,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 424.0,
      "completions/max_terminated_length": 424.0,
      "completions/mean_length": 154.03125,
      "completions/mean_terminated_length": 154.03125,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "entropy": 0.2907712608575821,
      "epoch": 0.16470588235294117,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.40935981273651123,
      "learning_rate": 9.490196078431371e-07,
      "loss": 0.0,
      "num_tokens": 2136512.0,
      "reward": 0.5937778353691101,
      "reward_std": 0.270163893699646,
      "rewards/rna_reward_fn/mean": 0.5937778353691101,
      "rewards/rna_reward_fn/std": 0.3509018123149872,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 184.40625,
      "completions/mean_terminated_length": 184.40625,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "entropy": 0.27846619486808777,
      "epoch": 0.17647058823529413,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.41769424080848694,
      "learning_rate": 9.450980392156862e-07,
      "loss": 0.0,
      "num_tokens": 2326368.0,
      "reward": 0.6163018941879272,
      "reward_std": 0.26538053154945374,
      "rewards/rna_reward_fn/mean": 0.6163018941879272,
      "rewards/rna_reward_fn/std": 0.3496814966201782,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 389.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 117.84375,
      "completions/mean_terminated_length": 117.84375,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "entropy": 0.2604786157608032,
      "epoch": 0.18823529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3845226764678955,
      "learning_rate": 9.411764705882352e-07,
      "loss": 0.0,
      "num_tokens": 2448064.0,
      "reward": 0.5925071239471436,
      "reward_std": 0.2943580150604248,
      "rewards/rna_reward_fn/mean": 0.5925071239471436,
      "rewards/rna_reward_fn/std": 0.3674796521663666,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 112.125,
      "completions/mean_terminated_length": 112.125,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "entropy": 0.25712524354457855,
      "epoch": 0.2,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.39543959498405457,
      "learning_rate": 9.372549019607843e-07,
      "loss": -0.0,
      "num_tokens": 2563904.0,
      "reward": 0.5904660224914551,
      "reward_std": 0.26803961396217346,
      "rewards/rna_reward_fn/mean": 0.5904660224914551,
      "rewards/rna_reward_fn/std": 0.3583122193813324,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 138.40625,
      "completions/mean_terminated_length": 138.40625,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "entropy": 0.27494488656520844,
      "epoch": 0.21176470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32191383838653564,
      "learning_rate": 9.333333333333333e-07,
      "loss": -0.0,
      "num_tokens": 2706656.0,
      "reward": 0.6467701196670532,
      "reward_std": 0.2634694576263428,
      "rewards/rna_reward_fn/mean": 0.6467701196670532,
      "rewards/rna_reward_fn/std": 0.3313148319721222,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 137.6875,
      "completions/mean_terminated_length": 137.6875,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "entropy": 0.260918065905571,
      "epoch": 0.2235294117647059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4905475378036499,
      "learning_rate": 9.294117647058824e-07,
      "loss": 0.0,
      "num_tokens": 2848672.0,
      "reward": 0.5871793031692505,
      "reward_std": 0.25154006481170654,
      "rewards/rna_reward_fn/mean": 0.5871793031692505,
      "rewards/rna_reward_fn/std": 0.3587729334831238,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 166.78125,
      "completions/mean_terminated_length": 166.78125,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "entropy": 0.26801037788391113,
      "epoch": 0.23529411764705882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7330372929573059,
      "learning_rate": 9.254901960784314e-07,
      "loss": -0.0,
      "num_tokens": 3020480.0,
      "reward": 0.5460379123687744,
      "reward_std": 0.27695512771606445,
      "rewards/rna_reward_fn/mean": 0.5460379123687744,
      "rewards/rna_reward_fn/std": 0.37495046854019165,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 142.6875,
      "completions/mean_terminated_length": 142.6875,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "entropy": 0.26508544385433197,
      "epoch": 0.24705882352941178,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4575193524360657,
      "learning_rate": 9.215686274509803e-07,
      "loss": 0.0,
      "num_tokens": 3167616.0,
      "reward": 0.6192805171012878,
      "reward_std": 0.2736813426017761,
      "rewards/rna_reward_fn/mean": 0.6192805171012878,
      "rewards/rna_reward_fn/std": 0.3539046049118042,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 154.25,
      "completions/mean_terminated_length": 154.25,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "entropy": 0.25467583537101746,
      "epoch": 0.25882352941176473,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.407436341047287,
      "learning_rate": 9.176470588235293e-07,
      "loss": 0.0,
      "num_tokens": 3326592.0,
      "reward": 0.5778753757476807,
      "reward_std": 0.27449485659599304,
      "rewards/rna_reward_fn/mean": 0.5778753757476807,
      "rewards/rna_reward_fn/std": 0.3692671060562134,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 376.0,
      "completions/max_terminated_length": 376.0,
      "completions/mean_length": 135.46875,
      "completions/mean_terminated_length": 135.46875,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.23743800073862076,
      "epoch": 0.27058823529411763,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.36481839418411255,
      "learning_rate": 9.137254901960783e-07,
      "loss": -0.0,
      "num_tokens": 3466336.0,
      "reward": 0.6230462193489075,
      "reward_std": 0.27385085821151733,
      "rewards/rna_reward_fn/mean": 0.6230462193489075,
      "rewards/rna_reward_fn/std": 0.35384857654571533,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 159.25,
      "completions/mean_terminated_length": 159.25,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "entropy": 0.2592047303915024,
      "epoch": 0.2823529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.40386804938316345,
      "learning_rate": 9.098039215686274e-07,
      "loss": -0.0,
      "num_tokens": 3630432.0,
      "reward": 0.587247908115387,
      "reward_std": 0.26836222410202026,
      "rewards/rna_reward_fn/mean": 0.587247908115387,
      "rewards/rna_reward_fn/std": 0.3811717927455902,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 400.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 152.375,
      "completions/mean_terminated_length": 152.375,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.23664871603250504,
      "epoch": 0.29411764705882354,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.514864444732666,
      "learning_rate": 9.058823529411764e-07,
      "loss": -0.0,
      "num_tokens": 3787488.0,
      "reward": 0.6044737696647644,
      "reward_std": 0.2556478679180145,
      "rewards/rna_reward_fn/mean": 0.6044737696647644,
      "rewards/rna_reward_fn/std": 0.3558889329433441,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 418.0,
      "completions/max_terminated_length": 418.0,
      "completions/mean_length": 140.5,
      "completions/mean_terminated_length": 140.5,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 0.2437874600291252,
      "epoch": 0.3058823529411765,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4290100038051605,
      "learning_rate": 9.019607843137255e-07,
      "loss": -0.0,
      "num_tokens": 3932384.0,
      "reward": 0.583857536315918,
      "reward_std": 0.2450568526983261,
      "rewards/rna_reward_fn/mean": 0.583857536315918,
      "rewards/rna_reward_fn/std": 0.3653680384159088,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 472.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 164.8125,
      "completions/mean_terminated_length": 164.8125,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.24944818764925003,
      "epoch": 0.3176470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.42284926772117615,
      "learning_rate": 8.980392156862745e-07,
      "loss": -0.0,
      "num_tokens": 4102176.0,
      "reward": 0.5925735235214233,
      "reward_std": 0.2968187630176544,
      "rewards/rna_reward_fn/mean": 0.5925735235214233,
      "rewards/rna_reward_fn/std": 0.3608212471008301,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 397.0,
      "completions/max_terminated_length": 397.0,
      "completions/mean_length": 146.1875,
      "completions/mean_terminated_length": 146.1875,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "entropy": 0.22080854326486588,
      "epoch": 0.32941176470588235,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4605961740016937,
      "learning_rate": 8.941176470588236e-07,
      "loss": 0.0,
      "num_tokens": 4252896.0,
      "reward": 0.5584173202514648,
      "reward_std": 0.2890748083591461,
      "rewards/rna_reward_fn/mean": 0.5584173202514648,
      "rewards/rna_reward_fn/std": 0.3958645462989807,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 175.90625,
      "completions/mean_terminated_length": 175.90625,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.2321019321680069,
      "epoch": 0.3411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5582552552223206,
      "learning_rate": 8.901960784313724e-07,
      "loss": 0.0,
      "num_tokens": 4434048.0,
      "reward": 0.5966294407844543,
      "reward_std": 0.2823025584220886,
      "rewards/rna_reward_fn/mean": 0.5966294407844543,
      "rewards/rna_reward_fn/std": 0.3560717701911926,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 174.1875,
      "completions/mean_terminated_length": 174.1875,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 0.21510899811983109,
      "epoch": 0.35294117647058826,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.49061208963394165,
      "learning_rate": 8.862745098039215e-07,
      "loss": -0.0,
      "num_tokens": 4613440.0,
      "reward": 0.5848400592803955,
      "reward_std": 0.267974317073822,
      "rewards/rna_reward_fn/mean": 0.5848400592803955,
      "rewards/rna_reward_fn/std": 0.37775954604148865,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 490.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 163.15625,
      "completions/mean_terminated_length": 163.15625,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "entropy": 0.2507341653108597,
      "epoch": 0.36470588235294116,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.603717029094696,
      "learning_rate": 8.823529411764705e-07,
      "loss": 0.0,
      "num_tokens": 4781536.0,
      "reward": 0.6572607159614563,
      "reward_std": 0.2553848624229431,
      "rewards/rna_reward_fn/mean": 0.6572607159614563,
      "rewards/rna_reward_fn/std": 0.3443078398704529,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 170.34375,
      "completions/mean_terminated_length": 170.34375,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.2254045456647873,
      "epoch": 0.3764705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5129714608192444,
      "learning_rate": 8.784313725490196e-07,
      "loss": -0.0,
      "num_tokens": 4956992.0,
      "reward": 0.6237974762916565,
      "reward_std": 0.2781754732131958,
      "rewards/rna_reward_fn/mean": 0.6237974762916565,
      "rewards/rna_reward_fn/std": 0.37038782238960266,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 500.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 140.96875,
      "completions/mean_terminated_length": 140.96875,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "entropy": 0.23444515466690063,
      "epoch": 0.38823529411764707,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5718296766281128,
      "learning_rate": 8.745098039215686e-07,
      "loss": -0.0,
      "num_tokens": 5102368.0,
      "reward": 0.663845956325531,
      "reward_std": 0.23731249570846558,
      "rewards/rna_reward_fn/mean": 0.6638458967208862,
      "rewards/rna_reward_fn/std": 0.3386061191558838,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 426.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 135.84375,
      "completions/mean_terminated_length": 135.84375,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "entropy": 0.21551413834095,
      "epoch": 0.4,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.48484402894973755,
      "learning_rate": 8.705882352941177e-07,
      "loss": 0.0,
      "num_tokens": 5242496.0,
      "reward": 0.5733575224876404,
      "reward_std": 0.2985653281211853,
      "rewards/rna_reward_fn/mean": 0.5733575224876404,
      "rewards/rna_reward_fn/std": 0.3665997385978699,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 128.34375,
      "completions/mean_terminated_length": 128.34375,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.19232773780822754,
      "epoch": 0.4117647058823529,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3821423351764679,
      "learning_rate": 8.666666666666667e-07,
      "loss": 0.0,
      "num_tokens": 5374944.0,
      "reward": 0.6459628939628601,
      "reward_std": 0.27456825971603394,
      "rewards/rna_reward_fn/mean": 0.6459628939628601,
      "rewards/rna_reward_fn/std": 0.3492187559604645,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 117.28125,
      "completions/mean_terminated_length": 117.28125,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "entropy": 0.2170068845152855,
      "epoch": 0.4235294117647059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.519378125667572,
      "learning_rate": 8.627450980392156e-07,
      "loss": -0.0,
      "num_tokens": 5496064.0,
      "reward": 0.6556386947631836,
      "reward_std": 0.2442726194858551,
      "rewards/rna_reward_fn/mean": 0.6556386947631836,
      "rewards/rna_reward_fn/std": 0.3574485182762146,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 143.75,
      "completions/mean_terminated_length": 143.75,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "entropy": 0.23470622301101685,
      "epoch": 0.43529411764705883,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4268864691257477,
      "learning_rate": 8.588235294117646e-07,
      "loss": 0.0,
      "num_tokens": 5644288.0,
      "reward": 0.6998727917671204,
      "reward_std": 0.2536011040210724,
      "rewards/rna_reward_fn/mean": 0.6998728513717651,
      "rewards/rna_reward_fn/std": 0.34483227133750916,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 161.84375,
      "completions/mean_terminated_length": 161.84375,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "entropy": 0.20661279559135437,
      "epoch": 0.4470588235294118,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.49551478028297424,
      "learning_rate": 8.549019607843136e-07,
      "loss": 0.0,
      "num_tokens": 5811040.0,
      "reward": 0.60715651512146,
      "reward_std": 0.2498263716697693,
      "rewards/rna_reward_fn/mean": 0.60715651512146,
      "rewards/rna_reward_fn/std": 0.3692743182182312,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 169.09375,
      "completions/mean_terminated_length": 169.09375,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "entropy": 0.22686513513326645,
      "epoch": 0.4588235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.539655864238739,
      "learning_rate": 8.509803921568627e-07,
      "loss": 0.0,
      "num_tokens": 5985216.0,
      "reward": 0.606254518032074,
      "reward_std": 0.27362608909606934,
      "rewards/rna_reward_fn/mean": 0.606254518032074,
      "rewards/rna_reward_fn/std": 0.37834590673446655,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 158.625,
      "completions/mean_terminated_length": 158.625,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "entropy": 0.20522872358560562,
      "epoch": 0.47058823529411764,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4427753686904907,
      "learning_rate": 8.470588235294117e-07,
      "loss": 0.0,
      "num_tokens": 6148672.0,
      "reward": 0.6244011521339417,
      "reward_std": 0.2686484158039093,
      "rewards/rna_reward_fn/mean": 0.6244011521339417,
      "rewards/rna_reward_fn/std": 0.3721536099910736,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 165.5,
      "completions/mean_terminated_length": 165.5,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 0.22500251233577728,
      "epoch": 0.4823529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.8924270272254944,
      "learning_rate": 8.431372549019608e-07,
      "loss": 0.0,
      "num_tokens": 6319168.0,
      "reward": 0.5321128368377686,
      "reward_std": 0.29077643156051636,
      "rewards/rna_reward_fn/mean": 0.5321128368377686,
      "rewards/rna_reward_fn/std": 0.3840348422527313,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 160.90625,
      "completions/mean_terminated_length": 160.90625,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "entropy": 0.23232445865869522,
      "epoch": 0.49411764705882355,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4376697540283203,
      "learning_rate": 8.392156862745098e-07,
      "loss": 0.0,
      "num_tokens": 6484960.0,
      "reward": 0.6353960037231445,
      "reward_std": 0.2474566251039505,
      "rewards/rna_reward_fn/mean": 0.6353960037231445,
      "rewards/rna_reward_fn/std": 0.3577839136123657,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 156.9375,
      "completions/mean_terminated_length": 156.9375,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "entropy": 0.21899814903736115,
      "epoch": 0.5058823529411764,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5491610765457153,
      "learning_rate": 8.352941176470589e-07,
      "loss": -0.0,
      "num_tokens": 6646688.0,
      "reward": 0.6090617775917053,
      "reward_std": 0.2399156093597412,
      "rewards/rna_reward_fn/mean": 0.6090618371963501,
      "rewards/rna_reward_fn/std": 0.35401132702827454,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 161.28125,
      "completions/mean_terminated_length": 161.28125,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "entropy": 0.2018352746963501,
      "epoch": 0.5176470588235295,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4728248715400696,
      "learning_rate": 8.313725490196078e-07,
      "loss": -0.0,
      "num_tokens": 6812864.0,
      "reward": 0.5414500832557678,
      "reward_std": 0.257457435131073,
      "rewards/rna_reward_fn/mean": 0.5414501428604126,
      "rewards/rna_reward_fn/std": 0.37554678320884705,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 144.53125,
      "completions/mean_terminated_length": 144.53125,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.21590139716863632,
      "epoch": 0.5294117647058824,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.45613518357276917,
      "learning_rate": 8.274509803921567e-07,
      "loss": 0.0,
      "num_tokens": 6961888.0,
      "reward": 0.5840362310409546,
      "reward_std": 0.24920199811458588,
      "rewards/rna_reward_fn/mean": 0.5840362310409546,
      "rewards/rna_reward_fn/std": 0.3838988244533539,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 151.59375,
      "completions/mean_terminated_length": 151.59375,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.20446214824914932,
      "epoch": 0.5411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4725431799888611,
      "learning_rate": 8.235294117647058e-07,
      "loss": 0.0,
      "num_tokens": 7118144.0,
      "reward": 0.5587388277053833,
      "reward_std": 0.25771480798721313,
      "rewards/rna_reward_fn/mean": 0.5587388277053833,
      "rewards/rna_reward_fn/std": 0.3881581127643585,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 472.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 148.09375,
      "completions/mean_terminated_length": 148.09375,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.20715581625699997,
      "epoch": 0.5529411764705883,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5775709748268127,
      "learning_rate": 8.196078431372548e-07,
      "loss": -0.0,
      "num_tokens": 7270816.0,
      "reward": 0.6535854935646057,
      "reward_std": 0.23074793815612793,
      "rewards/rna_reward_fn/mean": 0.6535854339599609,
      "rewards/rna_reward_fn/std": 0.35560858249664307,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 148.25,
      "completions/mean_terminated_length": 148.25,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.20631568133831024,
      "epoch": 0.5647058823529412,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5872889161109924,
      "learning_rate": 8.156862745098039e-07,
      "loss": -0.0,
      "num_tokens": 7423648.0,
      "reward": 0.5795817375183105,
      "reward_std": 0.26122066378593445,
      "rewards/rna_reward_fn/mean": 0.5795817375183105,
      "rewards/rna_reward_fn/std": 0.3758288025856018,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 362.0,
      "completions/max_terminated_length": 362.0,
      "completions/mean_length": 124.71875,
      "completions/mean_terminated_length": 124.71875,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "entropy": 0.19562938064336777,
      "epoch": 0.5764705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.450082391500473,
      "learning_rate": 8.117647058823529e-07,
      "loss": 0.0,
      "num_tokens": 7552384.0,
      "reward": 0.657599925994873,
      "reward_std": 0.24575895071029663,
      "rewards/rna_reward_fn/mean": 0.657599925994873,
      "rewards/rna_reward_fn/std": 0.31881189346313477,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 144.53125,
      "completions/mean_terminated_length": 144.53125,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "entropy": 0.212866373360157,
      "epoch": 0.5882352941176471,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4694586992263794,
      "learning_rate": 8.07843137254902e-07,
      "loss": -0.0,
      "num_tokens": 7701408.0,
      "reward": 0.5784563422203064,
      "reward_std": 0.2643548846244812,
      "rewards/rna_reward_fn/mean": 0.5784563422203064,
      "rewards/rna_reward_fn/std": 0.3683941066265106,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 138.21875,
      "completions/mean_terminated_length": 138.21875,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.17988762259483337,
      "epoch": 0.6,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.44588983058929443,
      "learning_rate": 8.03921568627451e-07,
      "loss": 0.0,
      "num_tokens": 7843968.0,
      "reward": 0.6563807725906372,
      "reward_std": 0.2578202784061432,
      "rewards/rna_reward_fn/mean": 0.6563807725906372,
      "rewards/rna_reward_fn/std": 0.3404718339443207,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 181.0,
      "completions/mean_terminated_length": 181.0,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "entropy": 0.22444826364517212,
      "epoch": 0.611764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.49978184700012207,
      "learning_rate": 8e-07,
      "loss": -0.0,
      "num_tokens": 8030336.0,
      "reward": 0.6426054239273071,
      "reward_std": 0.2517712712287903,
      "rewards/rna_reward_fn/mean": 0.6426054239273071,
      "rewards/rna_reward_fn/std": 0.3629717528820038,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 185.40625,
      "completions/mean_terminated_length": 185.40625,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "entropy": 0.20722465217113495,
      "epoch": 0.6235294117647059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6321276426315308,
      "learning_rate": 7.960784313725489e-07,
      "loss": -0.0,
      "num_tokens": 8221216.0,
      "reward": 0.7105848789215088,
      "reward_std": 0.23574814200401306,
      "rewards/rna_reward_fn/mean": 0.7105848789215088,
      "rewards/rna_reward_fn/std": 0.3385322690010071,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 358.0,
      "completions/max_terminated_length": 358.0,
      "completions/mean_length": 148.125,
      "completions/mean_terminated_length": 148.125,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "entropy": 0.19676074385643005,
      "epoch": 0.6352941176470588,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.49395662546157837,
      "learning_rate": 7.92156862745098e-07,
      "loss": 0.0,
      "num_tokens": 8373920.0,
      "reward": 0.5770894885063171,
      "reward_std": 0.2644929885864258,
      "rewards/rna_reward_fn/mean": 0.5770894289016724,
      "rewards/rna_reward_fn/std": 0.3790797293186188,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 159.6875,
      "completions/mean_terminated_length": 159.6875,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "entropy": 0.18705828487873077,
      "epoch": 0.6470588235294118,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4197390079498291,
      "learning_rate": 7.88235294117647e-07,
      "loss": 0.0,
      "num_tokens": 8538464.0,
      "reward": 0.5764464139938354,
      "reward_std": 0.21550722420215607,
      "rewards/rna_reward_fn/mean": 0.5764464139938354,
      "rewards/rna_reward_fn/std": 0.364503413438797,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 146.125,
      "completions/mean_terminated_length": 146.125,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "entropy": 0.21118487417697906,
      "epoch": 0.6588235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.37146326899528503,
      "learning_rate": 7.84313725490196e-07,
      "loss": 0.0,
      "num_tokens": 8689120.0,
      "reward": 0.6104137897491455,
      "reward_std": 0.23754771053791046,
      "rewards/rna_reward_fn/mean": 0.6104137897491455,
      "rewards/rna_reward_fn/std": 0.3665221333503723,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 160.65625,
      "completions/mean_terminated_length": 160.65625,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "entropy": 0.1945827156305313,
      "epoch": 0.6705882352941176,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4072308838367462,
      "learning_rate": 7.803921568627451e-07,
      "loss": 0.0,
      "num_tokens": 8854656.0,
      "reward": 0.6713041067123413,
      "reward_std": 0.2212895005941391,
      "rewards/rna_reward_fn/mean": 0.6713041067123413,
      "rewards/rna_reward_fn/std": 0.3392506539821625,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 142.125,
      "completions/mean_terminated_length": 142.125,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "entropy": 0.18257632106542587,
      "epoch": 0.6823529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4942987561225891,
      "learning_rate": 7.764705882352941e-07,
      "loss": 0.0,
      "num_tokens": 9001216.0,
      "reward": 0.6629120707511902,
      "reward_std": 0.22726097702980042,
      "rewards/rna_reward_fn/mean": 0.6629120707511902,
      "rewards/rna_reward_fn/std": 0.31348657608032227,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 147.03125,
      "completions/mean_terminated_length": 147.03125,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "entropy": 0.20158874243497849,
      "epoch": 0.6941176470588235,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5187806487083435,
      "learning_rate": 7.725490196078432e-07,
      "loss": 0.0,
      "num_tokens": 9152800.0,
      "reward": 0.6476730108261108,
      "reward_std": 0.24552714824676514,
      "rewards/rna_reward_fn/mean": 0.6476730108261108,
      "rewards/rna_reward_fn/std": 0.33643367886543274,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 159.4375,
      "completions/mean_terminated_length": 159.4375,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "entropy": 0.18591003119945526,
      "epoch": 0.7058823529411765,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.36044302582740784,
      "learning_rate": 7.686274509803921e-07,
      "loss": 0.0,
      "num_tokens": 9317088.0,
      "reward": 0.6832787394523621,
      "reward_std": 0.22806429862976074,
      "rewards/rna_reward_fn/mean": 0.6832787394523621,
      "rewards/rna_reward_fn/std": 0.32348689436912537,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 474.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 160.96875,
      "completions/mean_terminated_length": 160.96875,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "entropy": 0.21002116054296494,
      "epoch": 0.7176470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5378114581108093,
      "learning_rate": 7.647058823529411e-07,
      "loss": -0.0,
      "num_tokens": 9482944.0,
      "reward": 0.6531599760055542,
      "reward_std": 0.22567519545555115,
      "rewards/rna_reward_fn/mean": 0.653160035610199,
      "rewards/rna_reward_fn/std": 0.33769848942756653,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 348.0,
      "completions/max_terminated_length": 348.0,
      "completions/mean_length": 116.75,
      "completions/mean_terminated_length": 116.75,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "entropy": 0.18150582909584045,
      "epoch": 0.7294117647058823,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.39785146713256836,
      "learning_rate": 7.607843137254901e-07,
      "loss": -0.0,
      "num_tokens": 9603520.0,
      "reward": 0.565564751625061,
      "reward_std": 0.2807776927947998,
      "rewards/rna_reward_fn/mean": 0.565564751625061,
      "rewards/rna_reward_fn/std": 0.38936248421669006,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 391.0,
      "completions/max_terminated_length": 391.0,
      "completions/mean_length": 147.78125,
      "completions/mean_terminated_length": 147.78125,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 0.189855195581913,
      "epoch": 0.7411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4473720192909241,
      "learning_rate": 7.568627450980392e-07,
      "loss": -0.0,
      "num_tokens": 9755872.0,
      "reward": 0.6822654008865356,
      "reward_std": 0.23419374227523804,
      "rewards/rna_reward_fn/mean": 0.6822654008865356,
      "rewards/rna_reward_fn/std": 0.32637539505958557,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 171.28125,
      "completions/mean_terminated_length": 171.28125,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "entropy": 0.19365741312503815,
      "epoch": 0.7529411764705882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5170744061470032,
      "learning_rate": 7.529411764705882e-07,
      "loss": -0.0,
      "num_tokens": 9932288.0,
      "reward": 0.6570923328399658,
      "reward_std": 0.24268731474876404,
      "rewards/rna_reward_fn/mean": 0.6570923328399658,
      "rewards/rna_reward_fn/std": 0.3360862731933594,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 400.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 138.5625,
      "completions/mean_terminated_length": 138.5625,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "entropy": 0.15700556337833405,
      "epoch": 0.7647058823529411,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.669632077217102,
      "learning_rate": 7.490196078431373e-07,
      "loss": -0.0,
      "num_tokens": 10075200.0,
      "reward": 0.5884541273117065,
      "reward_std": 0.25077739357948303,
      "rewards/rna_reward_fn/mean": 0.5884541869163513,
      "rewards/rna_reward_fn/std": 0.3707042634487152,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 147.875,
      "completions/mean_terminated_length": 147.875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.1868809014558792,
      "epoch": 0.7764705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.496881365776062,
      "learning_rate": 7.450980392156863e-07,
      "loss": -0.0,
      "num_tokens": 10227648.0,
      "reward": 0.6189287900924683,
      "reward_std": 0.23646032810211182,
      "rewards/rna_reward_fn/mean": 0.6189287900924683,
      "rewards/rna_reward_fn/std": 0.3614950180053711,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 127.75,
      "completions/mean_terminated_length": 127.75,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "entropy": 0.17434925585985184,
      "epoch": 0.788235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5550652742385864,
      "learning_rate": 7.411764705882352e-07,
      "loss": 0.0,
      "num_tokens": 10359488.0,
      "reward": 0.5918734073638916,
      "reward_std": 0.2727334499359131,
      "rewards/rna_reward_fn/mean": 0.5918734073638916,
      "rewards/rna_reward_fn/std": 0.35672324895858765,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 151.96875,
      "completions/mean_terminated_length": 151.96875,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "entropy": 0.17505493760108948,
      "epoch": 0.8,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3833948075771332,
      "learning_rate": 7.372549019607843e-07,
      "loss": -0.0,
      "num_tokens": 10516128.0,
      "reward": 0.7000205516815186,
      "reward_std": 0.23740704357624054,
      "rewards/rna_reward_fn/mean": 0.7000205516815186,
      "rewards/rna_reward_fn/std": 0.3234153985977173,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 141.15625,
      "completions/mean_terminated_length": 141.15625,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.17628953605890274,
      "epoch": 0.8117647058823529,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3673928678035736,
      "learning_rate": 7.333333333333332e-07,
      "loss": 0.0,
      "num_tokens": 10661696.0,
      "reward": 0.6538941860198975,
      "reward_std": 0.19288064539432526,
      "rewards/rna_reward_fn/mean": 0.6538941860198975,
      "rewards/rna_reward_fn/std": 0.3515564203262329,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 195.53125,
      "completions/mean_terminated_length": 195.53125,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "entropy": 0.18974752724170685,
      "epoch": 0.8235294117647058,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.719358503818512,
      "learning_rate": 7.294117647058823e-07,
      "loss": -0.0,
      "num_tokens": 10862944.0,
      "reward": 0.5886421203613281,
      "reward_std": 0.23114809393882751,
      "rewards/rna_reward_fn/mean": 0.5886421203613281,
      "rewards/rna_reward_fn/std": 0.36729925870895386,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 156.46875,
      "completions/mean_terminated_length": 156.46875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.17211396992206573,
      "epoch": 0.8352941176470589,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4566245377063751,
      "learning_rate": 7.254901960784313e-07,
      "loss": 0.0,
      "num_tokens": 11024192.0,
      "reward": 0.6206304430961609,
      "reward_std": 0.20096182823181152,
      "rewards/rna_reward_fn/mean": 0.6206304430961609,
      "rewards/rna_reward_fn/std": 0.3349648714065552,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 124.84375,
      "completions/mean_terminated_length": 124.84375,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.16766826063394547,
      "epoch": 0.8470588235294118,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4164656698703766,
      "learning_rate": 7.215686274509804e-07,
      "loss": -0.0,
      "num_tokens": 11153056.0,
      "reward": 0.6351762413978577,
      "reward_std": 0.2213377058506012,
      "rewards/rna_reward_fn/mean": 0.6351762413978577,
      "rewards/rna_reward_fn/std": 0.3493310809135437,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 412.0,
      "completions/max_terminated_length": 412.0,
      "completions/mean_length": 129.65625,
      "completions/mean_terminated_length": 129.65625,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "entropy": 0.16023673117160797,
      "epoch": 0.8588235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.41499394178390503,
      "learning_rate": 7.176470588235294e-07,
      "loss": 0.0,
      "num_tokens": 11286848.0,
      "reward": 0.6752070784568787,
      "reward_std": 0.24617840349674225,
      "rewards/rna_reward_fn/mean": 0.6752070784568787,
      "rewards/rna_reward_fn/std": 0.34732139110565186,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 164.9375,
      "completions/mean_terminated_length": 164.9375,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.18363939225673676,
      "epoch": 0.8705882352941177,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.45577237010002136,
      "learning_rate": 7.137254901960785e-07,
      "loss": -0.0,
      "num_tokens": 11456768.0,
      "reward": 0.5772933959960938,
      "reward_std": 0.23847423493862152,
      "rewards/rna_reward_fn/mean": 0.5772933959960938,
      "rewards/rna_reward_fn/std": 0.3823261260986328,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 492.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 188.28125,
      "completions/mean_terminated_length": 188.28125,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "entropy": 0.1838960349559784,
      "epoch": 0.8823529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5237012505531311,
      "learning_rate": 7.098039215686274e-07,
      "loss": 0.0,
      "num_tokens": 11650592.0,
      "reward": 0.6181286573410034,
      "reward_std": 0.2555590569972992,
      "rewards/rna_reward_fn/mean": 0.6181286573410034,
      "rewards/rna_reward_fn/std": 0.37019652128219604,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 182.1875,
      "completions/mean_terminated_length": 182.1875,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "entropy": 0.1790659874677658,
      "epoch": 0.8941176470588236,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4818723499774933,
      "learning_rate": 7.058823529411765e-07,
      "loss": -0.0,
      "num_tokens": 11838176.0,
      "reward": 0.578412652015686,
      "reward_std": 0.22860457003116608,
      "rewards/rna_reward_fn/mean": 0.578412652015686,
      "rewards/rna_reward_fn/std": 0.35265785455703735,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 200.21875,
      "completions/mean_terminated_length": 200.21875,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "entropy": 0.18565233796834946,
      "epoch": 0.9058823529411765,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7948216795921326,
      "learning_rate": 7.019607843137254e-07,
      "loss": 0.0,
      "num_tokens": 12044224.0,
      "reward": 0.6187993288040161,
      "reward_std": 0.2622474431991577,
      "rewards/rna_reward_fn/mean": 0.6187993288040161,
      "rewards/rna_reward_fn/std": 0.326750248670578,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 145.8125,
      "completions/mean_terminated_length": 145.8125,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "entropy": 0.17154797911643982,
      "epoch": 0.9176470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.47565799951553345,
      "learning_rate": 6.980392156862744e-07,
      "loss": -0.0,
      "num_tokens": 12194560.0,
      "reward": 0.5971746444702148,
      "reward_std": 0.18512360751628876,
      "rewards/rna_reward_fn/mean": 0.5971747040748596,
      "rewards/rna_reward_fn/std": 0.3710518777370453,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 373.0,
      "completions/max_terminated_length": 373.0,
      "completions/mean_length": 128.71875,
      "completions/mean_terminated_length": 128.71875,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "entropy": 0.15196984261274338,
      "epoch": 0.9294117647058824,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4943343698978424,
      "learning_rate": 6.941176470588235e-07,
      "loss": -0.0,
      "num_tokens": 12327392.0,
      "reward": 0.6471496820449829,
      "reward_std": 0.22329822182655334,
      "rewards/rna_reward_fn/mean": 0.6471496820449829,
      "rewards/rna_reward_fn/std": 0.33536407351493835,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 400.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 137.84375,
      "completions/mean_terminated_length": 137.84375,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "entropy": 0.16948848217725754,
      "epoch": 0.9411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4759957492351532,
      "learning_rate": 6.901960784313725e-07,
      "loss": -0.0,
      "num_tokens": 12469568.0,
      "reward": 0.659608006477356,
      "reward_std": 0.18602336943149567,
      "rewards/rna_reward_fn/mean": 0.659608006477356,
      "rewards/rna_reward_fn/std": 0.3731914460659027,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 146.75,
      "completions/mean_terminated_length": 146.75,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "entropy": 0.18501683324575424,
      "epoch": 0.9529411764705882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.43785735964775085,
      "learning_rate": 6.862745098039216e-07,
      "loss": 0.0,
      "num_tokens": 12620864.0,
      "reward": 0.620478630065918,
      "reward_std": 0.22393935918807983,
      "rewards/rna_reward_fn/mean": 0.620478630065918,
      "rewards/rna_reward_fn/std": 0.35981276631355286,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 150.1875,
      "completions/mean_terminated_length": 150.1875,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "entropy": 0.1829531416296959,
      "epoch": 0.9647058823529412,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4392038583755493,
      "learning_rate": 6.823529411764706e-07,
      "loss": 0.0,
      "num_tokens": 12775680.0,
      "reward": 0.6712214350700378,
      "reward_std": 0.2174052894115448,
      "rewards/rna_reward_fn/mean": 0.6712214946746826,
      "rewards/rna_reward_fn/std": 0.3370954990386963,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 391.0,
      "completions/max_terminated_length": 391.0,
      "completions/mean_length": 141.8125,
      "completions/mean_terminated_length": 141.8125,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.1686822921037674,
      "epoch": 0.9764705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4484212398529053,
      "learning_rate": 6.784313725490196e-07,
      "loss": -0.0,
      "num_tokens": 12921920.0,
      "reward": 0.6464422345161438,
      "reward_std": 0.2250806838274002,
      "rewards/rna_reward_fn/mean": 0.6464422345161438,
      "rewards/rna_reward_fn/std": 0.3622319996356964,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 166.65625,
      "completions/mean_terminated_length": 166.65625,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "entropy": 0.17645781487226486,
      "epoch": 0.9882352941176471,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7668079137802124,
      "learning_rate": 6.745098039215686e-07,
      "loss": 0.0,
      "num_tokens": 13093600.0,
      "reward": 0.6832870244979858,
      "reward_std": 0.25750601291656494,
      "rewards/rna_reward_fn/mean": 0.6832869648933411,
      "rewards/rna_reward_fn/std": 0.3430787920951843,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 167.96875,
      "completions/mean_terminated_length": 167.96875,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "entropy": 0.17668870836496353,
      "epoch": 1.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.43097105622291565,
      "learning_rate": 6.705882352941176e-07,
      "loss": 0.0,
      "num_tokens": 13266624.0,
      "reward": 0.5539568662643433,
      "reward_std": 0.22693298757076263,
      "rewards/rna_reward_fn/mean": 0.5539568066596985,
      "rewards/rna_reward_fn/std": 0.38347697257995605,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 182.3125,
      "completions/mean_terminated_length": 182.3125,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "entropy": 0.1827656850218773,
      "epoch": 1.011764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5608375668525696,
      "learning_rate": 6.666666666666666e-07,
      "loss": 0.0,
      "num_tokens": 13454336.0,
      "reward": 0.7320628762245178,
      "reward_std": 0.22256582975387573,
      "rewards/rna_reward_fn/mean": 0.7320628762245178,
      "rewards/rna_reward_fn/std": 0.30846187472343445,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 140.625,
      "completions/mean_terminated_length": 140.625,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "entropy": 0.18483393639326096,
      "epoch": 1.0235294117647058,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4667685627937317,
      "learning_rate": 6.627450980392156e-07,
      "loss": 0.0,
      "num_tokens": 13599360.0,
      "reward": 0.6894385814666748,
      "reward_std": 0.20523157715797424,
      "rewards/rna_reward_fn/mean": 0.6894385814666748,
      "rewards/rna_reward_fn/std": 0.3155847191810608,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 150.46875,
      "completions/mean_terminated_length": 150.46875,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "entropy": 0.16182925552129745,
      "epoch": 1.035294117647059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6056375503540039,
      "learning_rate": 6.588235294117647e-07,
      "loss": -0.0,
      "num_tokens": 13754464.0,
      "reward": 0.6177388429641724,
      "reward_std": 0.24611341953277588,
      "rewards/rna_reward_fn/mean": 0.6177388429641724,
      "rewards/rna_reward_fn/std": 0.3494950830936432,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 144.09375,
      "completions/mean_terminated_length": 144.09375,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 0.17024414986371994,
      "epoch": 1.0470588235294118,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4357620179653168,
      "learning_rate": 6.549019607843137e-07,
      "loss": -0.0,
      "num_tokens": 13903040.0,
      "reward": 0.611262857913971,
      "reward_std": 0.19428220391273499,
      "rewards/rna_reward_fn/mean": 0.611262857913971,
      "rewards/rna_reward_fn/std": 0.3793390393257141,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 132.5625,
      "completions/mean_terminated_length": 132.5625,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "entropy": 0.16757714748382568,
      "epoch": 1.0588235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.440759539604187,
      "learning_rate": 6.509803921568627e-07,
      "loss": -0.0,
      "num_tokens": 14039808.0,
      "reward": 0.6882448196411133,
      "reward_std": 0.19556942582130432,
      "rewards/rna_reward_fn/mean": 0.6882448196411133,
      "rewards/rna_reward_fn/std": 0.32508718967437744,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 143.78125,
      "completions/mean_terminated_length": 143.78125,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.1645500287413597,
      "epoch": 1.0705882352941176,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5613058805465698,
      "learning_rate": 6.470588235294117e-07,
      "loss": 0.0,
      "num_tokens": 14188064.0,
      "reward": 0.6789584159851074,
      "reward_std": 0.19199398159980774,
      "rewards/rna_reward_fn/mean": 0.6789584159851074,
      "rewards/rna_reward_fn/std": 0.3482169210910797,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 118.34375,
      "completions/mean_terminated_length": 118.34375,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "entropy": 0.14176590740680695,
      "epoch": 1.0823529411764705,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4092370867729187,
      "learning_rate": 6.431372549019608e-07,
      "loss": -0.0,
      "num_tokens": 14310272.0,
      "reward": 0.650740921497345,
      "reward_std": 0.18103614449501038,
      "rewards/rna_reward_fn/mean": 0.650740921497345,
      "rewards/rna_reward_fn/std": 0.32734215259552,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 154.3125,
      "completions/mean_terminated_length": 154.3125,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "entropy": 0.176346056163311,
      "epoch": 1.0941176470588236,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4459090232849121,
      "learning_rate": 6.392156862745097e-07,
      "loss": 0.0,
      "num_tokens": 14469312.0,
      "reward": 0.6732466816902161,
      "reward_std": 0.22345304489135742,
      "rewards/rna_reward_fn/mean": 0.6732466816902161,
      "rewards/rna_reward_fn/std": 0.3369784951210022,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 145.3125,
      "completions/mean_terminated_length": 145.3125,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "entropy": 0.1685405969619751,
      "epoch": 1.1058823529411765,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5043669939041138,
      "learning_rate": 6.352941176470588e-07,
      "loss": -0.0,
      "num_tokens": 14619136.0,
      "reward": 0.677271842956543,
      "reward_std": 0.20296773314476013,
      "rewards/rna_reward_fn/mean": 0.677271842956543,
      "rewards/rna_reward_fn/std": 0.320669025182724,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 170.1875,
      "completions/mean_terminated_length": 170.1875,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "entropy": 0.18431222438812256,
      "epoch": 1.1176470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6736860275268555,
      "learning_rate": 6.313725490196078e-07,
      "loss": -0.0,
      "num_tokens": 14794432.0,
      "reward": 0.6684234738349915,
      "reward_std": 0.259125292301178,
      "rewards/rna_reward_fn/mean": 0.6684235334396362,
      "rewards/rna_reward_fn/std": 0.34210121631622314,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 157.6875,
      "completions/mean_terminated_length": 157.6875,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.16836901009082794,
      "epoch": 1.1294117647058823,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4569699764251709,
      "learning_rate": 6.274509803921569e-07,
      "loss": -0.0,
      "num_tokens": 14956928.0,
      "reward": 0.68538498878479,
      "reward_std": 0.1874302327632904,
      "rewards/rna_reward_fn/mean": 0.68538498878479,
      "rewards/rna_reward_fn/std": 0.295845091342926,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 393.0,
      "completions/max_terminated_length": 393.0,
      "completions/mean_length": 140.21875,
      "completions/mean_terminated_length": 140.21875,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "entropy": 0.158738911151886,
      "epoch": 1.1411764705882352,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4725809395313263,
      "learning_rate": 6.235294117647059e-07,
      "loss": 0.0,
      "num_tokens": 15101536.0,
      "reward": 0.6654532551765442,
      "reward_std": 0.18864062428474426,
      "rewards/rna_reward_fn/mean": 0.6654532551765442,
      "rewards/rna_reward_fn/std": 0.3371845781803131,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 162.8125,
      "completions/mean_terminated_length": 162.8125,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "entropy": 0.17738928645849228,
      "epoch": 1.1529411764705881,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5798309445381165,
      "learning_rate": 6.196078431372548e-07,
      "loss": -0.0,
      "num_tokens": 15269280.0,
      "reward": 0.7147358655929565,
      "reward_std": 0.21203583478927612,
      "rewards/rna_reward_fn/mean": 0.7147358655929565,
      "rewards/rna_reward_fn/std": 0.33255505561828613,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 168.03125,
      "completions/mean_terminated_length": 168.03125,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.17116892337799072,
      "epoch": 1.1647058823529413,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5520632863044739,
      "learning_rate": 6.156862745098039e-07,
      "loss": -0.0,
      "num_tokens": 15442368.0,
      "reward": 0.6365219950675964,
      "reward_std": 0.20218491554260254,
      "rewards/rna_reward_fn/mean": 0.6365219950675964,
      "rewards/rna_reward_fn/std": 0.35175827145576477,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 397.0,
      "completions/max_terminated_length": 397.0,
      "completions/mean_length": 138.0,
      "completions/mean_terminated_length": 138.0,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.17306677252054214,
      "epoch": 1.1764705882352942,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4389256238937378,
      "learning_rate": 6.117647058823529e-07,
      "loss": 0.0,
      "num_tokens": 15584704.0,
      "reward": 0.7388399839401245,
      "reward_std": 0.16607630252838135,
      "rewards/rna_reward_fn/mean": 0.7388399839401245,
      "rewards/rna_reward_fn/std": 0.2576732635498047,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 392.0,
      "completions/max_terminated_length": 392.0,
      "completions/mean_length": 137.40625,
      "completions/mean_terminated_length": 137.40625,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "entropy": 0.15397901087999344,
      "epoch": 1.188235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5594757795333862,
      "learning_rate": 6.078431372549019e-07,
      "loss": -0.0,
      "num_tokens": 15726432.0,
      "reward": 0.7157045602798462,
      "reward_std": 0.22128766775131226,
      "rewards/rna_reward_fn/mean": 0.7157045602798462,
      "rewards/rna_reward_fn/std": 0.2969537079334259,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 400.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 127.78125,
      "completions/mean_terminated_length": 127.78125,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "entropy": 0.17225481569766998,
      "epoch": 1.2,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.40622541308403015,
      "learning_rate": 6.039215686274509e-07,
      "loss": -0.0,
      "num_tokens": 15858304.0,
      "reward": 0.7043038010597229,
      "reward_std": 0.22727924585342407,
      "rewards/rna_reward_fn/mean": 0.7043038606643677,
      "rewards/rna_reward_fn/std": 0.33978909254074097,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 167.625,
      "completions/mean_terminated_length": 167.625,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "entropy": 0.17464321851730347,
      "epoch": 1.2117647058823529,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4603181779384613,
      "learning_rate": 6e-07,
      "loss": -0.0,
      "num_tokens": 16030976.0,
      "reward": 0.61054527759552,
      "reward_std": 0.22179073095321655,
      "rewards/rna_reward_fn/mean": 0.61054527759552,
      "rewards/rna_reward_fn/std": 0.37210676074028015,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 156.8125,
      "completions/mean_terminated_length": 156.8125,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "entropy": 0.1658085659146309,
      "epoch": 1.223529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4843849539756775,
      "learning_rate": 5.96078431372549e-07,
      "loss": -0.0,
      "num_tokens": 16192576.0,
      "reward": 0.6978532075881958,
      "reward_std": 0.1981123685836792,
      "rewards/rna_reward_fn/mean": 0.6978532671928406,
      "rewards/rna_reward_fn/std": 0.3141247630119324,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 181.1875,
      "completions/mean_terminated_length": 181.1875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.16212371736764908,
      "epoch": 1.2352941176470589,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5290284752845764,
      "learning_rate": 5.921568627450981e-07,
      "loss": 0.0,
      "num_tokens": 16379136.0,
      "reward": 0.6463083028793335,
      "reward_std": 0.1896321177482605,
      "rewards/rna_reward_fn/mean": 0.6463083028793335,
      "rewards/rna_reward_fn/std": 0.36457034945487976,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 282.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 124.3125,
      "completions/mean_terminated_length": 124.3125,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "entropy": 0.15162574499845505,
      "epoch": 1.2470588235294118,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.47445422410964966,
      "learning_rate": 5.88235294117647e-07,
      "loss": 0.0,
      "num_tokens": 16507456.0,
      "reward": 0.672465980052948,
      "reward_std": 0.20273976027965546,
      "rewards/rna_reward_fn/mean": 0.6724659204483032,
      "rewards/rna_reward_fn/std": 0.3352026343345642,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 154.25,
      "completions/mean_terminated_length": 154.25,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "entropy": 0.1651393622159958,
      "epoch": 1.2588235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.48081472516059875,
      "learning_rate": 5.843137254901961e-07,
      "loss": -0.0,
      "num_tokens": 16666432.0,
      "reward": 0.6745295524597168,
      "reward_std": 0.21466964483261108,
      "rewards/rna_reward_fn/mean": 0.6745295524597168,
      "rewards/rna_reward_fn/std": 0.3604423701763153,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 176.34375,
      "completions/mean_terminated_length": 176.34375,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.16943742334842682,
      "epoch": 1.2705882352941176,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4931647777557373,
      "learning_rate": 5.803921568627451e-07,
      "loss": 0.0,
      "num_tokens": 16848032.0,
      "reward": 0.6875256896018982,
      "reward_std": 0.2435401976108551,
      "rewards/rna_reward_fn/mean": 0.6875256896018982,
      "rewards/rna_reward_fn/std": 0.3279384672641754,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 158.09375,
      "completions/mean_terminated_length": 158.09375,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "entropy": 0.17465446144342422,
      "epoch": 1.2823529411764705,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5001822113990784,
      "learning_rate": 5.76470588235294e-07,
      "loss": 0.0,
      "num_tokens": 17010944.0,
      "reward": 0.6029446125030518,
      "reward_std": 0.1757221221923828,
      "rewards/rna_reward_fn/mean": 0.6029446125030518,
      "rewards/rna_reward_fn/std": 0.35652756690979004,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 426.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 167.40625,
      "completions/mean_terminated_length": 167.40625,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 0.16541431099176407,
      "epoch": 1.2941176470588236,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4689631760120392,
      "learning_rate": 5.725490196078431e-07,
      "loss": -0.0,
      "num_tokens": 17183392.0,
      "reward": 0.6704152226448059,
      "reward_std": 0.20997245609760284,
      "rewards/rna_reward_fn/mean": 0.6704152226448059,
      "rewards/rna_reward_fn/std": 0.32471874356269836,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 398.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 141.71875,
      "completions/mean_terminated_length": 141.71875,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "entropy": 0.1647869274020195,
      "epoch": 1.3058823529411765,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5760033130645752,
      "learning_rate": 5.686274509803921e-07,
      "loss": -0.0,
      "num_tokens": 17329536.0,
      "reward": 0.6938682198524475,
      "reward_std": 0.20044496655464172,
      "rewards/rna_reward_fn/mean": 0.6938682198524475,
      "rewards/rna_reward_fn/std": 0.32881274819374084,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 115.96875,
      "completions/mean_terminated_length": 115.96875,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "entropy": 0.1390109360218048,
      "epoch": 1.3176470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5902699828147888,
      "learning_rate": 5.647058823529412e-07,
      "loss": 0.0,
      "num_tokens": 17449312.0,
      "reward": 0.651271402835846,
      "reward_std": 0.17913030087947845,
      "rewards/rna_reward_fn/mean": 0.651271402835846,
      "rewards/rna_reward_fn/std": 0.3490009009838104,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 472.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 179.8125,
      "completions/mean_terminated_length": 179.8125,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "entropy": 0.16215970367193222,
      "epoch": 1.3294117647058823,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6261849403381348,
      "learning_rate": 5.607843137254902e-07,
      "loss": -0.0,
      "num_tokens": 17634464.0,
      "reward": 0.6400759220123291,
      "reward_std": 0.2095731794834137,
      "rewards/rna_reward_fn/mean": 0.6400759220123291,
      "rewards/rna_reward_fn/std": 0.34743088483810425,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 139.59375,
      "completions/mean_terminated_length": 139.59375,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "entropy": 0.17950539290905,
      "epoch": 1.3411764705882354,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4431358277797699,
      "learning_rate": 5.568627450980392e-07,
      "loss": 0.0,
      "num_tokens": 17778432.0,
      "reward": 0.7148804068565369,
      "reward_std": 0.19681406021118164,
      "rewards/rna_reward_fn/mean": 0.7148803472518921,
      "rewards/rna_reward_fn/std": 0.2995694577693939,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 482.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 167.6875,
      "completions/mean_terminated_length": 167.6875,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "entropy": 0.16394728422164917,
      "epoch": 1.3529411764705883,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4245275557041168,
      "learning_rate": 5.529411764705882e-07,
      "loss": -0.0,
      "num_tokens": 17951168.0,
      "reward": 0.6865168213844299,
      "reward_std": 0.21481367945671082,
      "rewards/rna_reward_fn/mean": 0.6865168213844299,
      "rewards/rna_reward_fn/std": 0.3217703402042389,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 146.6875,
      "completions/mean_terminated_length": 146.6875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.16379400342702866,
      "epoch": 1.3647058823529412,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7668678760528564,
      "learning_rate": 5.490196078431373e-07,
      "loss": -0.0,
      "num_tokens": 18102400.0,
      "reward": 0.7100426554679871,
      "reward_std": 0.20684288442134857,
      "rewards/rna_reward_fn/mean": 0.7100426554679871,
      "rewards/rna_reward_fn/std": 0.32808709144592285,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 147.40625,
      "completions/mean_terminated_length": 147.40625,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "entropy": 0.16369594633579254,
      "epoch": 1.3764705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4491204619407654,
      "learning_rate": 5.450980392156862e-07,
      "loss": -0.0,
      "num_tokens": 18254368.0,
      "reward": 0.6345921754837036,
      "reward_std": 0.17989099025726318,
      "rewards/rna_reward_fn/mean": 0.6345921754837036,
      "rewards/rna_reward_fn/std": 0.3739507794380188,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 128.96875,
      "completions/mean_terminated_length": 128.96875,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "entropy": 0.16341928392648697,
      "epoch": 1.388235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.40218448638916016,
      "learning_rate": 5.411764705882353e-07,
      "loss": 0.0,
      "num_tokens": 18387456.0,
      "reward": 0.6973093748092651,
      "reward_std": 0.19106432795524597,
      "rewards/rna_reward_fn/mean": 0.6973093748092651,
      "rewards/rna_reward_fn/std": 0.328565388917923,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 185.3125,
      "completions/mean_terminated_length": 185.3125,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "entropy": 0.15643662959337234,
      "epoch": 1.4,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4641011953353882,
      "learning_rate": 5.372549019607843e-07,
      "loss": -0.0,
      "num_tokens": 18578240.0,
      "reward": 0.6982426643371582,
      "reward_std": 0.17999790608882904,
      "rewards/rna_reward_fn/mean": 0.6982426643371582,
      "rewards/rna_reward_fn/std": 0.3187488615512848,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 500.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 151.125,
      "completions/mean_terminated_length": 151.125,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "entropy": 0.16167542338371277,
      "epoch": 1.4117647058823528,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4710671305656433,
      "learning_rate": 5.333333333333333e-07,
      "loss": -0.0,
      "num_tokens": 18734016.0,
      "reward": 0.765220046043396,
      "reward_std": 0.16310608386993408,
      "rewards/rna_reward_fn/mean": 0.765220046043396,
      "rewards/rna_reward_fn/std": 0.30073776841163635,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 200.53125,
      "completions/mean_terminated_length": 200.53125,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "entropy": 0.17333289235830307,
      "epoch": 1.423529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5605267882347107,
      "learning_rate": 5.294117647058823e-07,
      "loss": -0.0,
      "num_tokens": 18940384.0,
      "reward": 0.6207563877105713,
      "reward_std": 0.2605891227722168,
      "rewards/rna_reward_fn/mean": 0.6207563877105713,
      "rewards/rna_reward_fn/std": 0.35733622312545776,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 378.0,
      "completions/max_terminated_length": 378.0,
      "completions/mean_length": 126.90625,
      "completions/mean_terminated_length": 126.90625,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.16177111864089966,
      "epoch": 1.4352941176470588,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5492433905601501,
      "learning_rate": 5.254901960784313e-07,
      "loss": 0.0,
      "num_tokens": 19071360.0,
      "reward": 0.6156597137451172,
      "reward_std": 0.2084151953458786,
      "rewards/rna_reward_fn/mean": 0.6156597137451172,
      "rewards/rna_reward_fn/std": 0.3588009178638458,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 389.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 126.15625,
      "completions/mean_terminated_length": 126.15625,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.1655115783214569,
      "epoch": 1.4470588235294117,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5015555024147034,
      "learning_rate": 5.215686274509804e-07,
      "loss": 0.0,
      "num_tokens": 19201568.0,
      "reward": 0.6790971755981445,
      "reward_std": 0.20820938050746918,
      "rewards/rna_reward_fn/mean": 0.6790972352027893,
      "rewards/rna_reward_fn/std": 0.33763545751571655,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 153.75,
      "completions/mean_terminated_length": 153.75,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "entropy": 0.1595897227525711,
      "epoch": 1.4588235294117646,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5314822793006897,
      "learning_rate": 5.176470588235294e-07,
      "loss": 0.0,
      "num_tokens": 19360032.0,
      "reward": 0.6510605812072754,
      "reward_std": 0.18497204780578613,
      "rewards/rna_reward_fn/mean": 0.6510605812072754,
      "rewards/rna_reward_fn/std": 0.3650972247123718,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 134.5625,
      "completions/mean_terminated_length": 134.5625,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "entropy": 0.1490706205368042,
      "epoch": 1.4705882352941178,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5578471422195435,
      "learning_rate": 5.137254901960784e-07,
      "loss": -0.0,
      "num_tokens": 19498848.0,
      "reward": 0.6481872797012329,
      "reward_std": 0.19116738438606262,
      "rewards/rna_reward_fn/mean": 0.6481872797012329,
      "rewards/rna_reward_fn/std": 0.32832634449005127,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 186.0625,
      "completions/mean_terminated_length": 186.0625,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "entropy": 0.16315071284770966,
      "epoch": 1.4823529411764707,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.47001388669013977,
      "learning_rate": 5.098039215686274e-07,
      "loss": 0.0,
      "num_tokens": 19690400.0,
      "reward": 0.6869475245475769,
      "reward_std": 0.21966272592544556,
      "rewards/rna_reward_fn/mean": 0.6869475245475769,
      "rewards/rna_reward_fn/std": 0.3061429262161255,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 159.25,
      "completions/mean_terminated_length": 159.25,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "entropy": 0.1544899046421051,
      "epoch": 1.4941176470588236,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7163305878639221,
      "learning_rate": 5.058823529411765e-07,
      "loss": 0.0,
      "num_tokens": 19854496.0,
      "reward": 0.7104751467704773,
      "reward_std": 0.17693877220153809,
      "rewards/rna_reward_fn/mean": 0.7104751467704773,
      "rewards/rna_reward_fn/std": 0.30990538001060486,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 134.6875,
      "completions/mean_terminated_length": 134.6875,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "entropy": 0.16278471052646637,
      "epoch": 1.5058823529411764,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7567697167396545,
      "learning_rate": 5.019607843137255e-07,
      "loss": -0.0,
      "num_tokens": 19993440.0,
      "reward": 0.6815826296806335,
      "reward_std": 0.20137576758861542,
      "rewards/rna_reward_fn/mean": 0.6815826296806335,
      "rewards/rna_reward_fn/std": 0.32526591420173645,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 142.5625,
      "completions/mean_terminated_length": 142.5625,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "entropy": 0.16126833856105804,
      "epoch": 1.5176470588235293,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5958517789840698,
      "learning_rate": 4.980392156862744e-07,
      "loss": 0.0,
      "num_tokens": 20140448.0,
      "reward": 0.6496865153312683,
      "reward_std": 0.23397710919380188,
      "rewards/rna_reward_fn/mean": 0.6496865153312683,
      "rewards/rna_reward_fn/std": 0.3660079836845398,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 418.0,
      "completions/max_terminated_length": 418.0,
      "completions/mean_length": 178.3125,
      "completions/mean_terminated_length": 178.3125,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "entropy": 0.16705547273159027,
      "epoch": 1.5294117647058822,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5045768618583679,
      "learning_rate": 4.941176470588235e-07,
      "loss": 0.0,
      "num_tokens": 20324064.0,
      "reward": 0.6084290146827698,
      "reward_std": 0.22301070392131805,
      "rewards/rna_reward_fn/mean": 0.608428955078125,
      "rewards/rna_reward_fn/std": 0.37412387132644653,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 178.21875,
      "completions/mean_terminated_length": 178.21875,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "entropy": 0.16225259751081467,
      "epoch": 1.5411764705882351,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4790975749492645,
      "learning_rate": 4.901960784313725e-07,
      "loss": -0.0,
      "num_tokens": 20507584.0,
      "reward": 0.6834284067153931,
      "reward_std": 0.16327084600925446,
      "rewards/rna_reward_fn/mean": 0.6834284067153931,
      "rewards/rna_reward_fn/std": 0.3331601321697235,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 124.46875,
      "completions/mean_terminated_length": 124.46875,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "entropy": 0.14231518656015396,
      "epoch": 1.5529411764705883,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.45782116055488586,
      "learning_rate": 4.862745098039216e-07,
      "loss": -0.0,
      "num_tokens": 20636064.0,
      "reward": 0.6696175336837769,
      "reward_std": 0.1951877474784851,
      "rewards/rna_reward_fn/mean": 0.6696175336837769,
      "rewards/rna_reward_fn/std": 0.3469404876232147,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 153.09375,
      "completions/mean_terminated_length": 153.09375,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.14148423075675964,
      "epoch": 1.5647058823529412,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6516547203063965,
      "learning_rate": 4.823529411764705e-07,
      "loss": -0.0,
      "num_tokens": 20793856.0,
      "reward": 0.6711336374282837,
      "reward_std": 0.2223963439464569,
      "rewards/rna_reward_fn/mean": 0.6711336374282837,
      "rewards/rna_reward_fn/std": 0.3334668278694153,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 144.34375,
      "completions/mean_terminated_length": 144.34375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "entropy": 0.1529795005917549,
      "epoch": 1.576470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5148042440414429,
      "learning_rate": 4.784313725490196e-07,
      "loss": 0.0,
      "num_tokens": 20942688.0,
      "reward": 0.759110152721405,
      "reward_std": 0.16160593926906586,
      "rewards/rna_reward_fn/mean": 0.7591102123260498,
      "rewards/rna_reward_fn/std": 0.2931617796421051,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 339.0,
      "completions/max_terminated_length": 339.0,
      "completions/mean_length": 108.34375,
      "completions/mean_terminated_length": 108.34375,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "entropy": 0.1443817839026451,
      "epoch": 1.5882352941176472,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.42829352617263794,
      "learning_rate": 4.7450980392156857e-07,
      "loss": -0.0,
      "num_tokens": 21054656.0,
      "reward": 0.6639102697372437,
      "reward_std": 0.20781482756137848,
      "rewards/rna_reward_fn/mean": 0.6639102697372437,
      "rewards/rna_reward_fn/std": 0.3437131941318512,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 175.03125,
      "completions/mean_terminated_length": 175.03125,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "entropy": 0.15896137803792953,
      "epoch": 1.6,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5342750549316406,
      "learning_rate": 4.705882352941176e-07,
      "loss": 0.0,
      "num_tokens": 21234912.0,
      "reward": 0.6274444460868835,
      "reward_std": 0.22071924805641174,
      "rewards/rna_reward_fn/mean": 0.6274445056915283,
      "rewards/rna_reward_fn/std": 0.3473777174949646,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 398.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 143.65625,
      "completions/mean_terminated_length": 143.65625,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "entropy": 0.15408551692962646,
      "epoch": 1.611764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.49438202381134033,
      "learning_rate": 4.6666666666666666e-07,
      "loss": 0.0,
      "num_tokens": 21383040.0,
      "reward": 0.6316537857055664,
      "reward_std": 0.1621330976486206,
      "rewards/rna_reward_fn/mean": 0.6316537857055664,
      "rewards/rna_reward_fn/std": 0.34947502613067627,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 392.0,
      "completions/max_terminated_length": 392.0,
      "completions/mean_length": 168.84375,
      "completions/mean_terminated_length": 168.84375,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 0.17249725759029388,
      "epoch": 1.6235294117647059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5168977379798889,
      "learning_rate": 4.627450980392157e-07,
      "loss": -0.0,
      "num_tokens": 21556960.0,
      "reward": 0.7472211122512817,
      "reward_std": 0.16369092464447021,
      "rewards/rna_reward_fn/mean": 0.7472211122512817,
      "rewards/rna_reward_fn/std": 0.27173811197280884,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 157.1875,
      "completions/mean_terminated_length": 157.1875,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 0.16690535098314285,
      "epoch": 1.6352941176470588,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5558773875236511,
      "learning_rate": 4.5882352941176465e-07,
      "loss": 0.0,
      "num_tokens": 21718944.0,
      "reward": 0.6854004859924316,
      "reward_std": 0.19929495453834534,
      "rewards/rna_reward_fn/mean": 0.6854004859924316,
      "rewards/rna_reward_fn/std": 0.31646665930747986,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 152.59375,
      "completions/mean_terminated_length": 152.59375,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.1484585627913475,
      "epoch": 1.6470588235294117,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.9384368062019348,
      "learning_rate": 4.549019607843137e-07,
      "loss": -0.0,
      "num_tokens": 21876224.0,
      "reward": 0.6835744380950928,
      "reward_std": 0.1949320137500763,
      "rewards/rna_reward_fn/mean": 0.6835744380950928,
      "rewards/rna_reward_fn/std": 0.35554417967796326,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 400.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 127.875,
      "completions/mean_terminated_length": 127.875,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "entropy": 0.14056292921304703,
      "epoch": 1.6588235294117646,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4758838713169098,
      "learning_rate": 4.5098039215686274e-07,
      "loss": 0.0,
      "num_tokens": 22008192.0,
      "reward": 0.7035012245178223,
      "reward_std": 0.18292057514190674,
      "rewards/rna_reward_fn/mean": 0.703501284122467,
      "rewards/rna_reward_fn/std": 0.29926764965057373,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 164.59375,
      "completions/mean_terminated_length": 164.59375,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "entropy": 0.1475282907485962,
      "epoch": 1.6705882352941175,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5269675254821777,
      "learning_rate": 4.470588235294118e-07,
      "loss": -0.0,
      "num_tokens": 22177760.0,
      "reward": 0.724274754524231,
      "reward_std": 0.20411115884780884,
      "rewards/rna_reward_fn/mean": 0.724274754524231,
      "rewards/rna_reward_fn/std": 0.29461607336997986,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 166.09375,
      "completions/mean_terminated_length": 166.09375,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "entropy": 0.14830049872398376,
      "epoch": 1.6823529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5128397345542908,
      "learning_rate": 4.4313725490196073e-07,
      "loss": 0.0,
      "num_tokens": 22348864.0,
      "reward": 0.6864579916000366,
      "reward_std": 0.18042539060115814,
      "rewards/rna_reward_fn/mean": 0.6864579916000366,
      "rewards/rna_reward_fn/std": 0.3156171441078186,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 358.0,
      "completions/max_terminated_length": 358.0,
      "completions/mean_length": 121.21875,
      "completions/mean_terminated_length": 121.21875,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "entropy": 0.14306584745645523,
      "epoch": 1.6941176470588235,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4526241421699524,
      "learning_rate": 4.392156862745098e-07,
      "loss": 0.0,
      "num_tokens": 22474016.0,
      "reward": 0.6906402111053467,
      "reward_std": 0.2201388031244278,
      "rewards/rna_reward_fn/mean": 0.6906402111053467,
      "rewards/rna_reward_fn/std": 0.3415301740169525,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 398.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 111.0625,
      "completions/mean_terminated_length": 111.0625,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.14087412506341934,
      "epoch": 1.7058823529411766,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4583019018173218,
      "learning_rate": 4.352941176470588e-07,
      "loss": 0.0,
      "num_tokens": 22588768.0,
      "reward": 0.7702864408493042,
      "reward_std": 0.1817162036895752,
      "rewards/rna_reward_fn/mean": 0.7702864408493042,
      "rewards/rna_reward_fn/std": 0.28576594591140747,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 152.46875,
      "completions/mean_terminated_length": 152.46875,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.13646821677684784,
      "epoch": 1.7176470588235295,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5821676850318909,
      "learning_rate": 4.313725490196078e-07,
      "loss": -0.0,
      "num_tokens": 22745920.0,
      "reward": 0.6735475659370422,
      "reward_std": 0.2079792022705078,
      "rewards/rna_reward_fn/mean": 0.6735475659370422,
      "rewards/rna_reward_fn/std": 0.34127116203308105,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 398.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 137.0625,
      "completions/mean_terminated_length": 137.0625,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "entropy": 0.1294446587562561,
      "epoch": 1.7294117647058824,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.47053244709968567,
      "learning_rate": 4.274509803921568e-07,
      "loss": 0.0,
      "num_tokens": 22887296.0,
      "reward": 0.7310217618942261,
      "reward_std": 0.16372641921043396,
      "rewards/rna_reward_fn/mean": 0.7310217618942261,
      "rewards/rna_reward_fn/std": 0.29399389028549194,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 164.03125,
      "completions/mean_terminated_length": 164.03125,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "entropy": 0.16281016170978546,
      "epoch": 1.7411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5588626861572266,
      "learning_rate": 4.2352941176470586e-07,
      "loss": 0.0,
      "num_tokens": 23056288.0,
      "reward": 0.654833197593689,
      "reward_std": 0.1884084939956665,
      "rewards/rna_reward_fn/mean": 0.654833197593689,
      "rewards/rna_reward_fn/std": 0.3517378270626068,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 140.84375,
      "completions/mean_terminated_length": 140.84375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "entropy": 0.15908341854810715,
      "epoch": 1.7529411764705882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5507121086120605,
      "learning_rate": 4.196078431372549e-07,
      "loss": 0.0,
      "num_tokens": 23201536.0,
      "reward": 0.699113667011261,
      "reward_std": 0.20187973976135254,
      "rewards/rna_reward_fn/mean": 0.699113667011261,
      "rewards/rna_reward_fn/std": 0.3249177634716034,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 400.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 192.4375,
      "completions/mean_terminated_length": 192.4375,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "entropy": 0.15749355405569077,
      "epoch": 1.7647058823529411,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.47758468985557556,
      "learning_rate": 4.156862745098039e-07,
      "loss": 0.0,
      "num_tokens": 23399616.0,
      "reward": 0.6602087020874023,
      "reward_std": 0.2426632046699524,
      "rewards/rna_reward_fn/mean": 0.6602087020874023,
      "rewards/rna_reward_fn/std": 0.3394790291786194,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 186.375,
      "completions/mean_terminated_length": 186.375,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.1590714380145073,
      "epoch": 1.776470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5084402561187744,
      "learning_rate": 4.117647058823529e-07,
      "loss": 0.0,
      "num_tokens": 23591488.0,
      "reward": 0.6650402545928955,
      "reward_std": 0.18303653597831726,
      "rewards/rna_reward_fn/mean": 0.6650401949882507,
      "rewards/rna_reward_fn/std": 0.33965203166007996,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 141.40625,
      "completions/mean_terminated_length": 141.40625,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "entropy": 0.14213567227125168,
      "epoch": 1.788235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5413779020309448,
      "learning_rate": 4.0784313725490194e-07,
      "loss": -0.0,
      "num_tokens": 23737312.0,
      "reward": 0.6437839865684509,
      "reward_std": 0.2132418155670166,
      "rewards/rna_reward_fn/mean": 0.6437839865684509,
      "rewards/rna_reward_fn/std": 0.3476622402667999,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 140.75,
      "completions/mean_terminated_length": 140.75,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "entropy": 0.14729295670986176,
      "epoch": 1.8,
      "frac_reward_zero_std": 0.03125,
      "grad_norm": 0.48154816031455994,
      "learning_rate": 4.03921568627451e-07,
      "loss": -0.0,
      "num_tokens": 23882464.0,
      "reward": 0.6620033979415894,
      "reward_std": 0.22405345737934113,
      "rewards/rna_reward_fn/mean": 0.6620033979415894,
      "rewards/rna_reward_fn/std": 0.3390491306781769,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 166.46875,
      "completions/mean_terminated_length": 166.46875,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "entropy": 0.14903101325035095,
      "epoch": 1.811764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.625751793384552,
      "learning_rate": 4e-07,
      "loss": -0.0,
      "num_tokens": 24053952.0,
      "reward": 0.6442551612854004,
      "reward_std": 0.17395520210266113,
      "rewards/rna_reward_fn/mean": 0.6442551612854004,
      "rewards/rna_reward_fn/std": 0.3670194745063782,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 157.90625,
      "completions/mean_terminated_length": 157.90625,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.15323904901742935,
      "epoch": 1.8235294117647058,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.48200494050979614,
      "learning_rate": 3.96078431372549e-07,
      "loss": -0.0,
      "num_tokens": 24216672.0,
      "reward": 0.6359031200408936,
      "reward_std": 0.17717690765857697,
      "rewards/rna_reward_fn/mean": 0.6359031200408936,
      "rewards/rna_reward_fn/std": 0.32817214727401733,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 472.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 145.8125,
      "completions/mean_terminated_length": 145.8125,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "entropy": 0.1613752394914627,
      "epoch": 1.835294117647059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.46832966804504395,
      "learning_rate": 3.92156862745098e-07,
      "loss": 0.0,
      "num_tokens": 24367008.0,
      "reward": 0.7130154371261597,
      "reward_std": 0.18193909525871277,
      "rewards/rna_reward_fn/mean": 0.7130154371261597,
      "rewards/rna_reward_fn/std": 0.3411928117275238,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 142.46875,
      "completions/mean_terminated_length": 142.46875,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "entropy": 0.13961906731128693,
      "epoch": 1.8470588235294119,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6261844038963318,
      "learning_rate": 3.8823529411764707e-07,
      "loss": -0.0,
      "num_tokens": 24513920.0,
      "reward": 0.711245596408844,
      "reward_std": 0.1767653077840805,
      "rewards/rna_reward_fn/mean": 0.7112456560134888,
      "rewards/rna_reward_fn/std": 0.3348366618156433,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 412.0,
      "completions/max_terminated_length": 412.0,
      "completions/mean_length": 152.4375,
      "completions/mean_terminated_length": 152.4375,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "entropy": 0.1567898690700531,
      "epoch": 1.8588235294117648,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5200847387313843,
      "learning_rate": 3.8431372549019606e-07,
      "loss": -0.0,
      "num_tokens": 24671040.0,
      "reward": 0.7147434949874878,
      "reward_std": 0.14905846118927002,
      "rewards/rna_reward_fn/mean": 0.7147434949874878,
      "rewards/rna_reward_fn/std": 0.3070945739746094,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 392.0,
      "completions/max_terminated_length": 392.0,
      "completions/mean_length": 125.71875,
      "completions/mean_terminated_length": 125.71875,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.133110411465168,
      "epoch": 1.8705882352941177,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4239906370639801,
      "learning_rate": 3.8039215686274506e-07,
      "loss": 0.0,
      "num_tokens": 24800800.0,
      "reward": 0.640139639377594,
      "reward_std": 0.20033451914787292,
      "rewards/rna_reward_fn/mean": 0.640139639377594,
      "rewards/rna_reward_fn/std": 0.3294910490512848,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 376.0,
      "completions/max_terminated_length": 376.0,
      "completions/mean_length": 134.8125,
      "completions/mean_terminated_length": 134.8125,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "entropy": 0.12187084183096886,
      "epoch": 1.8823529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.38697147369384766,
      "learning_rate": 3.764705882352941e-07,
      "loss": -0.0,
      "num_tokens": 24939872.0,
      "reward": 0.6659330725669861,
      "reward_std": 0.16438628733158112,
      "rewards/rna_reward_fn/mean": 0.6659330725669861,
      "rewards/rna_reward_fn/std": 0.35713815689086914,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 135.5625,
      "completions/mean_terminated_length": 135.5625,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "entropy": 0.13703680038452148,
      "epoch": 1.8941176470588235,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4564237594604492,
      "learning_rate": 3.7254901960784315e-07,
      "loss": 0.0,
      "num_tokens": 25079712.0,
      "reward": 0.6596216559410095,
      "reward_std": 0.20437049865722656,
      "rewards/rna_reward_fn/mean": 0.6596216559410095,
      "rewards/rna_reward_fn/std": 0.3517865240573883,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 177.0625,
      "completions/mean_terminated_length": 177.0625,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "entropy": 0.15036547183990479,
      "epoch": 1.9058823529411764,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.45348137617111206,
      "learning_rate": 3.6862745098039214e-07,
      "loss": -0.0,
      "num_tokens": 25262048.0,
      "reward": 0.6836435198783875,
      "reward_std": 0.20624709129333496,
      "rewards/rna_reward_fn/mean": 0.6836435198783875,
      "rewards/rna_reward_fn/std": 0.32797813415527344,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 141.71875,
      "completions/mean_terminated_length": 141.71875,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "entropy": 0.14257021248340607,
      "epoch": 1.9176470588235293,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4581199586391449,
      "learning_rate": 3.6470588235294114e-07,
      "loss": -0.0,
      "num_tokens": 25408192.0,
      "reward": 0.6231480836868286,
      "reward_std": 0.20732316374778748,
      "rewards/rna_reward_fn/mean": 0.6231480836868286,
      "rewards/rna_reward_fn/std": 0.35448968410491943,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 389.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 103.90625,
      "completions/mean_terminated_length": 103.90625,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "entropy": 0.11931119486689568,
      "epoch": 1.9294117647058824,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.42869991064071655,
      "learning_rate": 3.607843137254902e-07,
      "loss": -0.0,
      "num_tokens": 25515616.0,
      "reward": 0.7718137502670288,
      "reward_std": 0.15544265508651733,
      "rewards/rna_reward_fn/mean": 0.7718137502670288,
      "rewards/rna_reward_fn/std": 0.2820202112197876,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 118.34375,
      "completions/mean_terminated_length": 118.34375,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.13630840182304382,
      "epoch": 1.9411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4796566069126129,
      "learning_rate": 3.5686274509803923e-07,
      "loss": 0.0,
      "num_tokens": 25637824.0,
      "reward": 0.7639800310134888,
      "reward_std": 0.16217514872550964,
      "rewards/rna_reward_fn/mean": 0.7639800310134888,
      "rewards/rna_reward_fn/std": 0.2800072729587555,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 492.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 196.1875,
      "completions/mean_terminated_length": 196.1875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.1692701205611229,
      "epoch": 1.9529411764705882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.576678991317749,
      "learning_rate": 3.529411764705882e-07,
      "loss": 0.0,
      "num_tokens": 25839744.0,
      "reward": 0.62703537940979,
      "reward_std": 0.24643635749816895,
      "rewards/rna_reward_fn/mean": 0.62703537940979,
      "rewards/rna_reward_fn/std": 0.3669246435165405,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 167.96875,
      "completions/mean_terminated_length": 167.96875,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "entropy": 0.16024480760097504,
      "epoch": 1.9647058823529413,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7311699390411377,
      "learning_rate": 3.490196078431372e-07,
      "loss": 0.0,
      "num_tokens": 26012768.0,
      "reward": 0.6588948369026184,
      "reward_std": 0.1576000452041626,
      "rewards/rna_reward_fn/mean": 0.6588948965072632,
      "rewards/rna_reward_fn/std": 0.32907265424728394,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 201.5,
      "completions/mean_terminated_length": 201.5,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "entropy": 0.1511036530137062,
      "epoch": 1.9764705882352942,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4694945216178894,
      "learning_rate": 3.4509803921568627e-07,
      "loss": 0.0,
      "num_tokens": 26220128.0,
      "reward": 0.6976197957992554,
      "reward_std": 0.19369524717330933,
      "rewards/rna_reward_fn/mean": 0.6976197957992554,
      "rewards/rna_reward_fn/std": 0.32611048221588135,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 400.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 154.5,
      "completions/mean_terminated_length": 154.5,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.15085221827030182,
      "epoch": 1.988235294117647,
      "frac_reward_zero_std": 0.03125,
      "grad_norm": 0.7034254670143127,
      "learning_rate": 3.411764705882353e-07,
      "loss": 0.0,
      "num_tokens": 26379360.0,
      "reward": 0.6942508220672607,
      "reward_std": 0.20178331434726715,
      "rewards/rna_reward_fn/mean": 0.6942508220672607,
      "rewards/rna_reward_fn/std": 0.31030499935150146,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 490.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 160.53125,
      "completions/mean_terminated_length": 160.53125,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "entropy": 0.15548591315746307,
      "epoch": 2.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5434289574623108,
      "learning_rate": 3.372549019607843e-07,
      "loss": -0.0,
      "num_tokens": 26544768.0,
      "reward": 0.6601583957672119,
      "reward_std": 0.15550854802131653,
      "rewards/rna_reward_fn/mean": 0.6601583361625671,
      "rewards/rna_reward_fn/std": 0.3311554193496704,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 160.40625,
      "completions/mean_terminated_length": 160.40625,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "entropy": 0.1544594094157219,
      "epoch": 2.011764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6815203428268433,
      "learning_rate": 3.333333333333333e-07,
      "loss": 0.0,
      "num_tokens": 26710048.0,
      "reward": 0.5972940921783447,
      "reward_std": 0.18555977940559387,
      "rewards/rna_reward_fn/mean": 0.5972940921783447,
      "rewards/rna_reward_fn/std": 0.36445632576942444,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 157.40625,
      "completions/mean_terminated_length": 157.40625,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "entropy": 0.14051128178834915,
      "epoch": 2.023529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5093562602996826,
      "learning_rate": 3.2941176470588235e-07,
      "loss": 0.0,
      "num_tokens": 26872256.0,
      "reward": 0.6649138927459717,
      "reward_std": 0.2001783400774002,
      "rewards/rna_reward_fn/mean": 0.6649138331413269,
      "rewards/rna_reward_fn/std": 0.3582386374473572,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 173.75,
      "completions/mean_terminated_length": 173.75,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "entropy": 0.14279819279909134,
      "epoch": 2.0352941176470587,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4454724192619324,
      "learning_rate": 3.2549019607843134e-07,
      "loss": -0.0,
      "num_tokens": 27051200.0,
      "reward": 0.7748029828071594,
      "reward_std": 0.14138856530189514,
      "rewards/rna_reward_fn/mean": 0.7748030424118042,
      "rewards/rna_reward_fn/std": 0.2777082026004791,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 165.8125,
      "completions/mean_terminated_length": 165.8125,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "entropy": 0.13190212100744247,
      "epoch": 2.0470588235294116,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4604037404060364,
      "learning_rate": 3.215686274509804e-07,
      "loss": 0.0,
      "num_tokens": 27222016.0,
      "reward": 0.6792135238647461,
      "reward_std": 0.17050443589687347,
      "rewards/rna_reward_fn/mean": 0.6792135834693909,
      "rewards/rna_reward_fn/std": 0.3469991087913513,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 140.21875,
      "completions/mean_terminated_length": 140.21875,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "entropy": 0.11882514134049416,
      "epoch": 2.0588235294117645,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.42415928840637207,
      "learning_rate": 3.176470588235294e-07,
      "loss": -0.0,
      "num_tokens": 27366624.0,
      "reward": 0.618835985660553,
      "reward_std": 0.19730809330940247,
      "rewards/rna_reward_fn/mean": 0.6188360452651978,
      "rewards/rna_reward_fn/std": 0.3514353334903717,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 154.25,
      "completions/mean_terminated_length": 154.25,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 0.12727607041597366,
      "epoch": 2.070588235294118,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5830354690551758,
      "learning_rate": 3.1372549019607843e-07,
      "loss": 0.0,
      "num_tokens": 27525600.0,
      "reward": 0.6785444617271423,
      "reward_std": 0.18948182463645935,
      "rewards/rna_reward_fn/mean": 0.6785444617271423,
      "rewards/rna_reward_fn/std": 0.3351566791534424,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 147.78125,
      "completions/mean_terminated_length": 147.78125,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "entropy": 0.14719800651073456,
      "epoch": 2.0823529411764707,
      "frac_reward_zero_std": 0.03125,
      "grad_norm": 0.4794676899909973,
      "learning_rate": 3.098039215686274e-07,
      "loss": 0.0,
      "num_tokens": 27677952.0,
      "reward": 0.7077100276947021,
      "reward_std": 0.1931176781654358,
      "rewards/rna_reward_fn/mean": 0.7077100276947021,
      "rewards/rna_reward_fn/std": 0.3137640357017517,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 142.46875,
      "completions/mean_terminated_length": 142.46875,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "entropy": 0.15307611972093582,
      "epoch": 2.0941176470588236,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6268736720085144,
      "learning_rate": 3.0588235294117647e-07,
      "loss": 0.0,
      "num_tokens": 27824864.0,
      "reward": 0.7079458236694336,
      "reward_std": 0.2219894826412201,
      "rewards/rna_reward_fn/mean": 0.7079458236694336,
      "rewards/rna_reward_fn/std": 0.3472329080104828,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 164.0,
      "completions/mean_terminated_length": 164.0,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "entropy": 0.13749201595783234,
      "epoch": 2.1058823529411765,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5293802618980408,
      "learning_rate": 3.0196078431372546e-07,
      "loss": 0.0,
      "num_tokens": 27993824.0,
      "reward": 0.6385776996612549,
      "reward_std": 0.2456386685371399,
      "rewards/rna_reward_fn/mean": 0.6385776996612549,
      "rewards/rna_reward_fn/std": 0.36081886291503906,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 140.21875,
      "completions/mean_terminated_length": 140.21875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.1387496143579483,
      "epoch": 2.1176470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.538530707359314,
      "learning_rate": 2.980392156862745e-07,
      "loss": -0.0,
      "num_tokens": 28138432.0,
      "reward": 0.6739398241043091,
      "reward_std": 0.21720820665359497,
      "rewards/rna_reward_fn/mean": 0.6739398837089539,
      "rewards/rna_reward_fn/std": 0.30697187781333923,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 118.65625,
      "completions/mean_terminated_length": 118.65625,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.11488081514835358,
      "epoch": 2.1294117647058823,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.42285630106925964,
      "learning_rate": 2.941176470588235e-07,
      "loss": -0.0,
      "num_tokens": 28260960.0,
      "reward": 0.7317262887954712,
      "reward_std": 0.20456328988075256,
      "rewards/rna_reward_fn/mean": 0.7317262887954712,
      "rewards/rna_reward_fn/std": 0.2935360074043274,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 384.0,
      "completions/max_terminated_length": 384.0,
      "completions/mean_length": 128.8125,
      "completions/mean_terminated_length": 128.8125,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "entropy": 0.13038966059684753,
      "epoch": 2.1411764705882352,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.43837785720825195,
      "learning_rate": 2.9019607843137255e-07,
      "loss": 0.0,
      "num_tokens": 28393888.0,
      "reward": 0.7334122657775879,
      "reward_std": 0.1874283403158188,
      "rewards/rna_reward_fn/mean": 0.7334122657775879,
      "rewards/rna_reward_fn/std": 0.3205217123031616,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 142.1875,
      "completions/mean_terminated_length": 142.1875,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "entropy": 0.142289437353611,
      "epoch": 2.152941176470588,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4689069092273712,
      "learning_rate": 2.8627450980392154e-07,
      "loss": -0.0,
      "num_tokens": 28540512.0,
      "reward": 0.738664448261261,
      "reward_std": 0.16794101893901825,
      "rewards/rna_reward_fn/mean": 0.7386645078659058,
      "rewards/rna_reward_fn/std": 0.30475351214408875,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 150.1875,
      "completions/mean_terminated_length": 150.1875,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "entropy": 0.13591318577528,
      "epoch": 2.164705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.48003292083740234,
      "learning_rate": 2.823529411764706e-07,
      "loss": -0.0,
      "num_tokens": 28695328.0,
      "reward": 0.6993162631988525,
      "reward_std": 0.1979941427707672,
      "rewards/rna_reward_fn/mean": 0.6993162035942078,
      "rewards/rna_reward_fn/std": 0.31292685866355896,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 472.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 173.65625,
      "completions/mean_terminated_length": 173.65625,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "entropy": 0.15518562495708466,
      "epoch": 2.176470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6343421339988708,
      "learning_rate": 2.784313725490196e-07,
      "loss": -0.0,
      "num_tokens": 28874176.0,
      "reward": 0.7311723232269287,
      "reward_std": 0.2127300500869751,
      "rewards/rna_reward_fn/mean": 0.7311723232269287,
      "rewards/rna_reward_fn/std": 0.3124001622200012,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 137.5625,
      "completions/mean_terminated_length": 137.5625,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "entropy": 0.1409146785736084,
      "epoch": 2.1882352941176473,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.46661409735679626,
      "learning_rate": 2.7450980392156863e-07,
      "loss": -0.0,
      "num_tokens": 29016064.0,
      "reward": 0.7118009328842163,
      "reward_std": 0.16496126353740692,
      "rewards/rna_reward_fn/mean": 0.7118009328842163,
      "rewards/rna_reward_fn/std": 0.32205572724342346,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 151.21875,
      "completions/mean_terminated_length": 151.21875,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "entropy": 0.14989649504423141,
      "epoch": 2.2,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.44188031554222107,
      "learning_rate": 2.705882352941176e-07,
      "loss": -0.0,
      "num_tokens": 29171936.0,
      "reward": 0.7327808141708374,
      "reward_std": 0.17523989081382751,
      "rewards/rna_reward_fn/mean": 0.7327808141708374,
      "rewards/rna_reward_fn/std": 0.32806655764579773,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 389.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 157.84375,
      "completions/mean_terminated_length": 157.84375,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "entropy": 0.14322884380817413,
      "epoch": 2.211764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5148700475692749,
      "learning_rate": 2.6666666666666667e-07,
      "loss": -0.0,
      "num_tokens": 29334592.0,
      "reward": 0.6917252540588379,
      "reward_std": 0.17680642008781433,
      "rewards/rna_reward_fn/mean": 0.6917252540588379,
      "rewards/rna_reward_fn/std": 0.30800244212150574,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 164.0,
      "completions/mean_terminated_length": 164.0,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "entropy": 0.14842171967029572,
      "epoch": 2.223529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5274482369422913,
      "learning_rate": 2.6274509803921567e-07,
      "loss": 0.0,
      "num_tokens": 29503552.0,
      "reward": 0.7333264350891113,
      "reward_std": 0.17190617322921753,
      "rewards/rna_reward_fn/mean": 0.7333264350891113,
      "rewards/rna_reward_fn/std": 0.26974406838417053,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 167.875,
      "completions/mean_terminated_length": 167.875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.12728291004896164,
      "epoch": 2.235294117647059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4334995746612549,
      "learning_rate": 2.588235294117647e-07,
      "loss": -0.0,
      "num_tokens": 29676480.0,
      "reward": 0.6551768779754639,
      "reward_std": 0.18493275344371796,
      "rewards/rna_reward_fn/mean": 0.6551768779754639,
      "rewards/rna_reward_fn/std": 0.33756914734840393,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 142.59375,
      "completions/mean_terminated_length": 142.59375,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "entropy": 0.13632921129465103,
      "epoch": 2.2470588235294118,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5202718377113342,
      "learning_rate": 2.549019607843137e-07,
      "loss": -0.0,
      "num_tokens": 29823520.0,
      "reward": 0.779222309589386,
      "reward_std": 0.1619720160961151,
      "rewards/rna_reward_fn/mean": 0.779222309589386,
      "rewards/rna_reward_fn/std": 0.255502849817276,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 381.0,
      "completions/max_terminated_length": 381.0,
      "completions/mean_length": 141.8125,
      "completions/mean_terminated_length": 141.8125,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "entropy": 0.1468304842710495,
      "epoch": 2.2588235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4959217309951782,
      "learning_rate": 2.5098039215686275e-07,
      "loss": 0.0,
      "num_tokens": 29969760.0,
      "reward": 0.6328116655349731,
      "reward_std": 0.20429277420043945,
      "rewards/rna_reward_fn/mean": 0.6328116655349731,
      "rewards/rna_reward_fn/std": 0.3653068244457245,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 472.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 147.03125,
      "completions/mean_terminated_length": 147.03125,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "entropy": 0.14507943391799927,
      "epoch": 2.2705882352941176,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.46249526739120483,
      "learning_rate": 2.4705882352941175e-07,
      "loss": -0.0,
      "num_tokens": 30121344.0,
      "reward": 0.6946768760681152,
      "reward_std": 0.16386722028255463,
      "rewards/rna_reward_fn/mean": 0.6946768760681152,
      "rewards/rna_reward_fn/std": 0.3166311979293823,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 347.0,
      "completions/max_terminated_length": 347.0,
      "completions/mean_length": 119.1875,
      "completions/mean_terminated_length": 119.1875,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "entropy": 0.1289873719215393,
      "epoch": 2.2823529411764705,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.43832215666770935,
      "learning_rate": 2.431372549019608e-07,
      "loss": -0.0,
      "num_tokens": 30244416.0,
      "reward": 0.7309268116950989,
      "reward_std": 0.16351744532585144,
      "rewards/rna_reward_fn/mean": 0.7309267520904541,
      "rewards/rna_reward_fn/std": 0.27468279004096985,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 132.40625,
      "completions/mean_terminated_length": 132.40625,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "entropy": 0.14909712970256805,
      "epoch": 2.2941176470588234,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4866437613964081,
      "learning_rate": 2.392156862745098e-07,
      "loss": -0.0,
      "num_tokens": 30381024.0,
      "reward": 0.6669021844863892,
      "reward_std": 0.19414769113063812,
      "rewards/rna_reward_fn/mean": 0.6669021844863892,
      "rewards/rna_reward_fn/std": 0.3391817808151245,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 174.0,
      "completions/mean_terminated_length": 174.0,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "entropy": 0.14798294007778168,
      "epoch": 2.3058823529411763,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.590640127658844,
      "learning_rate": 2.352941176470588e-07,
      "loss": -0.0,
      "num_tokens": 30560224.0,
      "reward": 0.6385676860809326,
      "reward_std": 0.20142759382724762,
      "rewards/rna_reward_fn/mean": 0.6385676860809326,
      "rewards/rna_reward_fn/std": 0.34272608160972595,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 125.125,
      "completions/mean_terminated_length": 125.125,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "entropy": 0.1469191089272499,
      "epoch": 2.317647058823529,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4721366763114929,
      "learning_rate": 2.3137254901960785e-07,
      "loss": -0.0,
      "num_tokens": 30689376.0,
      "reward": 0.7269188165664673,
      "reward_std": 0.19917072355747223,
      "rewards/rna_reward_fn/mean": 0.7269188165664673,
      "rewards/rna_reward_fn/std": 0.3235536217689514,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 170.21875,
      "completions/mean_terminated_length": 170.21875,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "entropy": 0.1481616050004959,
      "epoch": 2.3294117647058825,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4824952483177185,
      "learning_rate": 2.2745098039215685e-07,
      "loss": 0.0,
      "num_tokens": 30864704.0,
      "reward": 0.7315170764923096,
      "reward_std": 0.19473856687545776,
      "rewards/rna_reward_fn/mean": 0.7315171360969543,
      "rewards/rna_reward_fn/std": 0.31163889169692993,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 124.21875,
      "completions/mean_terminated_length": 124.21875,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.11309440433979034,
      "epoch": 2.3411764705882354,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.43292057514190674,
      "learning_rate": 2.235294117647059e-07,
      "loss": -0.0,
      "num_tokens": 30992928.0,
      "reward": 0.6969711184501648,
      "reward_std": 0.18462812900543213,
      "rewards/rna_reward_fn/mean": 0.6969711780548096,
      "rewards/rna_reward_fn/std": 0.30229660868644714,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 295.0,
      "completions/max_terminated_length": 295.0,
      "completions/mean_length": 115.625,
      "completions/mean_terminated_length": 115.625,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "entropy": 0.1170443557202816,
      "epoch": 2.3529411764705883,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.42780736088752747,
      "learning_rate": 2.196078431372549e-07,
      "loss": 0.0,
      "num_tokens": 31112352.0,
      "reward": 0.7397186160087585,
      "reward_std": 0.16325643658638,
      "rewards/rna_reward_fn/mean": 0.7397185564041138,
      "rewards/rna_reward_fn/std": 0.2868645191192627,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 191.78125,
      "completions/mean_terminated_length": 191.78125,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "entropy": 0.158894345164299,
      "epoch": 2.364705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5923020243644714,
      "learning_rate": 2.156862745098039e-07,
      "loss": 0.0,
      "num_tokens": 31309760.0,
      "reward": 0.713019609451294,
      "reward_std": 0.1600976586341858,
      "rewards/rna_reward_fn/mean": 0.7130196690559387,
      "rewards/rna_reward_fn/std": 0.3151859641075134,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 500.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 167.15625,
      "completions/mean_terminated_length": 167.15625,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "entropy": 0.15573827922344208,
      "epoch": 2.376470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5989984273910522,
      "learning_rate": 2.1176470588235293e-07,
      "loss": -0.0,
      "num_tokens": 31481952.0,
      "reward": 0.7245238423347473,
      "reward_std": 0.21510586142539978,
      "rewards/rna_reward_fn/mean": 0.7245238423347473,
      "rewards/rna_reward_fn/std": 0.3133554756641388,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 336.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 147.15625,
      "completions/mean_terminated_length": 147.15625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "entropy": 0.14043358713388443,
      "epoch": 2.388235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.45242446660995483,
      "learning_rate": 2.0784313725490195e-07,
      "loss": 0.0,
      "num_tokens": 31633664.0,
      "reward": 0.6685344576835632,
      "reward_std": 0.19693541526794434,
      "rewards/rna_reward_fn/mean": 0.6685344576835632,
      "rewards/rna_reward_fn/std": 0.33878231048583984,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 160.78125,
      "completions/mean_terminated_length": 160.78125,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.14151378720998764,
      "epoch": 2.4,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.578268826007843,
      "learning_rate": 2.0392156862745097e-07,
      "loss": 0.0,
      "num_tokens": 31799328.0,
      "reward": 0.753953218460083,
      "reward_std": 0.14072492718696594,
      "rewards/rna_reward_fn/mean": 0.753953218460083,
      "rewards/rna_reward_fn/std": 0.323638916015625,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 116.71875,
      "completions/mean_terminated_length": 116.71875,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "entropy": 0.14078038185834885,
      "epoch": 2.411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5669292211532593,
      "learning_rate": 2e-07,
      "loss": 0.0,
      "num_tokens": 31919872.0,
      "reward": 0.7278470993041992,
      "reward_std": 0.18851059675216675,
      "rewards/rna_reward_fn/mean": 0.7278470993041992,
      "rewards/rna_reward_fn/std": 0.31520187854766846,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 165.1875,
      "completions/mean_terminated_length": 165.1875,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.1560438796877861,
      "epoch": 2.4235294117647057,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5335204005241394,
      "learning_rate": 1.96078431372549e-07,
      "loss": -0.0,
      "num_tokens": 32090048.0,
      "reward": 0.74782395362854,
      "reward_std": 0.16413238644599915,
      "rewards/rna_reward_fn/mean": 0.74782395362854,
      "rewards/rna_reward_fn/std": 0.27966901659965515,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 129.75,
      "completions/mean_terminated_length": 129.75,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "entropy": 0.13756585866212845,
      "epoch": 2.435294117647059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4791547358036041,
      "learning_rate": 1.9215686274509803e-07,
      "loss": -0.0,
      "num_tokens": 32223936.0,
      "reward": 0.7443541884422302,
      "reward_std": 0.20347487926483154,
      "rewards/rna_reward_fn/mean": 0.744354248046875,
      "rewards/rna_reward_fn/std": 0.2934330999851227,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 144.46875,
      "completions/mean_terminated_length": 144.46875,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "entropy": 0.14090368151664734,
      "epoch": 2.447058823529412,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.48767152428627014,
      "learning_rate": 1.8823529411764705e-07,
      "loss": -0.0,
      "num_tokens": 32372896.0,
      "reward": 0.7094341516494751,
      "reward_std": 0.1646713763475418,
      "rewards/rna_reward_fn/mean": 0.7094341516494751,
      "rewards/rna_reward_fn/std": 0.31243574619293213,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 121.375,
      "completions/mean_terminated_length": 121.375,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "entropy": 0.13812856376171112,
      "epoch": 2.458823529411765,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.43114832043647766,
      "learning_rate": 1.8431372549019607e-07,
      "loss": -0.0,
      "num_tokens": 32498208.0,
      "reward": 0.7636112570762634,
      "reward_std": 0.1354459822177887,
      "rewards/rna_reward_fn/mean": 0.7636112570762634,
      "rewards/rna_reward_fn/std": 0.2837965786457062,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 157.9375,
      "completions/mean_terminated_length": 157.9375,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "entropy": 0.12325883284211159,
      "epoch": 2.4705882352941178,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7042959928512573,
      "learning_rate": 1.803921568627451e-07,
      "loss": -0.0,
      "num_tokens": 32660960.0,
      "reward": 0.685276985168457,
      "reward_std": 0.14444154500961304,
      "rewards/rna_reward_fn/mean": 0.685276985168457,
      "rewards/rna_reward_fn/std": 0.3264351785182953,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 398.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 149.28125,
      "completions/mean_terminated_length": 149.28125,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "entropy": 0.14060577005147934,
      "epoch": 2.4823529411764707,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7576245665550232,
      "learning_rate": 1.764705882352941e-07,
      "loss": 0.0,
      "num_tokens": 32814848.0,
      "reward": 0.7403950691223145,
      "reward_std": 0.19349028170108795,
      "rewards/rna_reward_fn/mean": 0.7403950691223145,
      "rewards/rna_reward_fn/std": 0.31960996985435486,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 140.09375,
      "completions/mean_terminated_length": 140.09375,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.128474622964859,
      "epoch": 2.4941176470588236,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4394446611404419,
      "learning_rate": 1.7254901960784313e-07,
      "loss": -0.0,
      "num_tokens": 32959328.0,
      "reward": 0.7468061447143555,
      "reward_std": 0.13857056200504303,
      "rewards/rna_reward_fn/mean": 0.7468062043190002,
      "rewards/rna_reward_fn/std": 0.2608503997325897,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 144.03125,
      "completions/mean_terminated_length": 144.03125,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "entropy": 0.14114519208669662,
      "epoch": 2.5058823529411764,
      "frac_reward_zero_std": 0.03125,
      "grad_norm": 0.5121099352836609,
      "learning_rate": 1.6862745098039215e-07,
      "loss": 0.0,
      "num_tokens": 33107840.0,
      "reward": 0.6896160244941711,
      "reward_std": 0.17474885284900665,
      "rewards/rna_reward_fn/mean": 0.6896160244941711,
      "rewards/rna_reward_fn/std": 0.30136245489120483,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 196.625,
      "completions/mean_terminated_length": 196.625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "entropy": 0.1554037183523178,
      "epoch": 2.5176470588235293,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5231500864028931,
      "learning_rate": 1.6470588235294117e-07,
      "loss": 0.0,
      "num_tokens": 33310208.0,
      "reward": 0.7346584796905518,
      "reward_std": 0.20079070329666138,
      "rewards/rna_reward_fn/mean": 0.7346584796905518,
      "rewards/rna_reward_fn/std": 0.30361971259117126,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 138.9375,
      "completions/mean_terminated_length": 138.9375,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "entropy": 0.12060126662254333,
      "epoch": 2.5294117647058822,
      "frac_reward_zero_std": 0.03125,
      "grad_norm": 0.45047426223754883,
      "learning_rate": 1.607843137254902e-07,
      "loss": 0.0,
      "num_tokens": 33453504.0,
      "reward": 0.768707275390625,
      "reward_std": 0.13694067299365997,
      "rewards/rna_reward_fn/mean": 0.7687073349952698,
      "rewards/rna_reward_fn/std": 0.27220436930656433,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 198.78125,
      "completions/mean_terminated_length": 198.78125,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "entropy": 0.1575038880109787,
      "epoch": 2.541176470588235,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5329861044883728,
      "learning_rate": 1.5686274509803921e-07,
      "loss": -0.0,
      "num_tokens": 33658080.0,
      "reward": 0.7541199922561646,
      "reward_std": 0.15449070930480957,
      "rewards/rna_reward_fn/mean": 0.7541199922561646,
      "rewards/rna_reward_fn/std": 0.2656092345714569,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 412.0,
      "completions/max_terminated_length": 412.0,
      "completions/mean_length": 158.34375,
      "completions/mean_terminated_length": 158.34375,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "entropy": 0.15501223504543304,
      "epoch": 2.552941176470588,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.45992547273635864,
      "learning_rate": 1.5294117647058823e-07,
      "loss": 0.0,
      "num_tokens": 33821248.0,
      "reward": 0.7572486400604248,
      "reward_std": 0.15161246061325073,
      "rewards/rna_reward_fn/mean": 0.7572486400604248,
      "rewards/rna_reward_fn/std": 0.29167696833610535,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 169.625,
      "completions/mean_terminated_length": 169.625,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "entropy": 0.13358986377716064,
      "epoch": 2.564705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.8965858817100525,
      "learning_rate": 1.4901960784313725e-07,
      "loss": -0.0,
      "num_tokens": 33995968.0,
      "reward": 0.7292990684509277,
      "reward_std": 0.16865938901901245,
      "rewards/rna_reward_fn/mean": 0.7292990684509277,
      "rewards/rna_reward_fn/std": 0.30115416646003723,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 176.78125,
      "completions/mean_terminated_length": 176.78125,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.13434413820505142,
      "epoch": 2.576470588235294,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.582165002822876,
      "learning_rate": 1.4509803921568628e-07,
      "loss": -0.0,
      "num_tokens": 34178016.0,
      "reward": 0.6599196195602417,
      "reward_std": 0.196761354804039,
      "rewards/rna_reward_fn/mean": 0.6599196791648865,
      "rewards/rna_reward_fn/std": 0.33999550342559814,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 156.1875,
      "completions/mean_terminated_length": 156.1875,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "entropy": 0.1357617899775505,
      "epoch": 2.588235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5189464688301086,
      "learning_rate": 1.411764705882353e-07,
      "loss": 0.0,
      "num_tokens": 34338976.0,
      "reward": 0.7549696564674377,
      "reward_std": 0.1326015144586563,
      "rewards/rna_reward_fn/mean": 0.7549696564674377,
      "rewards/rna_reward_fn/std": 0.2852962613105774,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 148.15625,
      "completions/mean_terminated_length": 148.15625,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "entropy": 0.15427181124687195,
      "epoch": 2.6,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.536194920539856,
      "learning_rate": 1.3725490196078432e-07,
      "loss": 0.0,
      "num_tokens": 34491712.0,
      "reward": 0.7131255865097046,
      "reward_std": 0.14100758731365204,
      "rewards/rna_reward_fn/mean": 0.7131255865097046,
      "rewards/rna_reward_fn/std": 0.31784212589263916,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 380.0,
      "completions/max_terminated_length": 380.0,
      "completions/mean_length": 145.1875,
      "completions/mean_terminated_length": 145.1875,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "entropy": 0.13709458708763123,
      "epoch": 2.611764705882353,
      "frac_reward_zero_std": 0.03125,
      "grad_norm": 0.5712235569953918,
      "learning_rate": 1.3333333333333334e-07,
      "loss": 0.0,
      "num_tokens": 34641408.0,
      "reward": 0.7191460132598877,
      "reward_std": 0.16943207383155823,
      "rewards/rna_reward_fn/mean": 0.7191460132598877,
      "rewards/rna_reward_fn/std": 0.3015574514865875,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 145.1875,
      "completions/mean_terminated_length": 145.1875,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "entropy": 0.13566020876169205,
      "epoch": 2.623529411764706,
      "frac_reward_zero_std": 0.03125,
      "grad_norm": 0.4192090630531311,
      "learning_rate": 1.2941176470588236e-07,
      "loss": 0.0,
      "num_tokens": 34791104.0,
      "reward": 0.7555572986602783,
      "reward_std": 0.16786056756973267,
      "rewards/rna_reward_fn/mean": 0.7555572986602783,
      "rewards/rna_reward_fn/std": 0.2797638177871704,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 165.09375,
      "completions/mean_terminated_length": 165.09375,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "entropy": 0.12663453072309494,
      "epoch": 2.635294117647059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6057937145233154,
      "learning_rate": 1.2549019607843138e-07,
      "loss": -0.0,
      "num_tokens": 34961184.0,
      "reward": 0.6839346289634705,
      "reward_std": 0.19452279806137085,
      "rewards/rna_reward_fn/mean": 0.6839346289634705,
      "rewards/rna_reward_fn/std": 0.33146002888679504,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 167.65625,
      "completions/mean_terminated_length": 167.65625,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.1426771581172943,
      "epoch": 2.6470588235294117,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4763612747192383,
      "learning_rate": 1.215686274509804e-07,
      "loss": 0.0,
      "num_tokens": 35133888.0,
      "reward": 0.6619032621383667,
      "reward_std": 0.17893120646476746,
      "rewards/rna_reward_fn/mean": 0.6619032621383667,
      "rewards/rna_reward_fn/std": 0.3283209800720215,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 149.9375,
      "completions/mean_terminated_length": 149.9375,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.14778528362512589,
      "epoch": 2.6588235294117646,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4169410169124603,
      "learning_rate": 1.176470588235294e-07,
      "loss": -0.0,
      "num_tokens": 35288448.0,
      "reward": 0.6732456088066101,
      "reward_std": 0.16452832520008087,
      "rewards/rna_reward_fn/mean": 0.6732455492019653,
      "rewards/rna_reward_fn/std": 0.3249601721763611,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 145.09375,
      "completions/mean_terminated_length": 145.09375,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "entropy": 0.1449032723903656,
      "epoch": 2.6705882352941175,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6590065360069275,
      "learning_rate": 1.1372549019607842e-07,
      "loss": -0.0,
      "num_tokens": 35438048.0,
      "reward": 0.7874460220336914,
      "reward_std": 0.12049897015094757,
      "rewards/rna_reward_fn/mean": 0.7874460220336914,
      "rewards/rna_reward_fn/std": 0.2661431133747101,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 151.75,
      "completions/mean_terminated_length": 151.75,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.13789667189121246,
      "epoch": 2.682352941176471,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.501124918460846,
      "learning_rate": 1.0980392156862744e-07,
      "loss": -0.0,
      "num_tokens": 35594464.0,
      "reward": 0.76551353931427,
      "reward_std": 0.14058314263820648,
      "rewards/rna_reward_fn/mean": 0.7655135989189148,
      "rewards/rna_reward_fn/std": 0.2855876088142395,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 163.71875,
      "completions/mean_terminated_length": 163.71875,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "entropy": 0.14094559848308563,
      "epoch": 2.6941176470588237,
      "frac_reward_zero_std": 0.03125,
      "grad_norm": 0.736441433429718,
      "learning_rate": 1.0588235294117647e-07,
      "loss": 0.0,
      "num_tokens": 35763136.0,
      "reward": 0.6939565539360046,
      "reward_std": 0.16584208607673645,
      "rewards/rna_reward_fn/mean": 0.6939565539360046,
      "rewards/rna_reward_fn/std": 0.32086971402168274,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 139.78125,
      "completions/mean_terminated_length": 139.78125,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "entropy": 0.13419293239712715,
      "epoch": 2.7058823529411766,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6264002919197083,
      "learning_rate": 1.0196078431372549e-07,
      "loss": -0.0,
      "num_tokens": 35907296.0,
      "reward": 0.7488532066345215,
      "reward_std": 0.1620199978351593,
      "rewards/rna_reward_fn/mean": 0.7488532066345215,
      "rewards/rna_reward_fn/std": 0.2980068624019623,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 397.0,
      "completions/max_terminated_length": 397.0,
      "completions/mean_length": 137.40625,
      "completions/mean_terminated_length": 137.40625,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.13055864721536636,
      "epoch": 2.7176470588235295,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4814888536930084,
      "learning_rate": 9.80392156862745e-08,
      "loss": 0.0,
      "num_tokens": 36049024.0,
      "reward": 0.6655980348587036,
      "reward_std": 0.15648490190505981,
      "rewards/rna_reward_fn/mean": 0.6655980348587036,
      "rewards/rna_reward_fn/std": 0.35470837354660034,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 130.90625,
      "completions/mean_terminated_length": 130.90625,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "entropy": 0.12380100041627884,
      "epoch": 2.7294117647058824,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.583757221698761,
      "learning_rate": 9.411764705882353e-08,
      "loss": -0.0,
      "num_tokens": 36184096.0,
      "reward": 0.7524540424346924,
      "reward_std": 0.15423446893692017,
      "rewards/rna_reward_fn/mean": 0.7524540424346924,
      "rewards/rna_reward_fn/std": 0.28454405069351196,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 399.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 145.34375,
      "completions/mean_terminated_length": 145.34375,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "entropy": 0.1325184628367424,
      "epoch": 2.7411764705882353,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4390006959438324,
      "learning_rate": 9.019607843137255e-08,
      "loss": -0.0,
      "num_tokens": 36333952.0,
      "reward": 0.7277975082397461,
      "reward_std": 0.19573622941970825,
      "rewards/rna_reward_fn/mean": 0.7277975082397461,
      "rewards/rna_reward_fn/std": 0.32145431637763977,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 168.125,
      "completions/mean_terminated_length": 168.125,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "entropy": 0.13657083362340927,
      "epoch": 2.7529411764705882,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7681740522384644,
      "learning_rate": 8.627450980392157e-08,
      "loss": -0.0,
      "num_tokens": 36507136.0,
      "reward": 0.7168524265289307,
      "reward_std": 0.18613344430923462,
      "rewards/rna_reward_fn/mean": 0.7168524265289307,
      "rewards/rna_reward_fn/std": 0.3243979215621948,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 163.875,
      "completions/mean_terminated_length": 163.875,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 0.14333349466323853,
      "epoch": 2.764705882352941,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5657479763031006,
      "learning_rate": 8.235294117647059e-08,
      "loss": 0.0,
      "num_tokens": 36675968.0,
      "reward": 0.725771427154541,
      "reward_std": 0.16519448161125183,
      "rewards/rna_reward_fn/mean": 0.725771427154541,
      "rewards/rna_reward_fn/std": 0.29766252636909485,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 418.0,
      "completions/max_terminated_length": 418.0,
      "completions/mean_length": 156.46875,
      "completions/mean_terminated_length": 156.46875,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "entropy": 0.1441263109445572,
      "epoch": 2.776470588235294,
      "frac_reward_zero_std": 0.03125,
      "grad_norm": 0.4572143256664276,
      "learning_rate": 7.843137254901961e-08,
      "loss": 0.0,
      "num_tokens": 36837216.0,
      "reward": 0.742597222328186,
      "reward_std": 0.16114118695259094,
      "rewards/rna_reward_fn/mean": 0.742597222328186,
      "rewards/rna_reward_fn/std": 0.29970842599868774,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 396.0,
      "completions/max_terminated_length": 396.0,
      "completions/mean_length": 158.1875,
      "completions/mean_terminated_length": 158.1875,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "entropy": 0.1409977823495865,
      "epoch": 2.788235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.42590776085853577,
      "learning_rate": 7.450980392156863e-08,
      "loss": -0.0,
      "num_tokens": 37000224.0,
      "reward": 0.7145720720291138,
      "reward_std": 0.164639413356781,
      "rewards/rna_reward_fn/mean": 0.7145720720291138,
      "rewards/rna_reward_fn/std": 0.3098330497741699,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 276.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 107.21875,
      "completions/mean_terminated_length": 107.21875,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "entropy": 0.11754556372761726,
      "epoch": 2.8,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4764781892299652,
      "learning_rate": 7.058823529411765e-08,
      "loss": 0.0,
      "num_tokens": 37111040.0,
      "reward": 0.7425558567047119,
      "reward_std": 0.16547845304012299,
      "rewards/rna_reward_fn/mean": 0.7425558567047119,
      "rewards/rna_reward_fn/std": 0.3051395118236542,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 492.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 172.84375,
      "completions/mean_terminated_length": 172.84375,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "entropy": 0.14019257575273514,
      "epoch": 2.8117647058823527,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5157439708709717,
      "learning_rate": 6.666666666666667e-08,
      "loss": -0.0,
      "num_tokens": 37289056.0,
      "reward": 0.6816315650939941,
      "reward_std": 0.2366928905248642,
      "rewards/rna_reward_fn/mean": 0.6816315650939941,
      "rewards/rna_reward_fn/std": 0.326466828584671,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 164.15625,
      "completions/mean_terminated_length": 164.15625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "entropy": 0.1466379389166832,
      "epoch": 2.8235294117647056,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5139991044998169,
      "learning_rate": 6.274509803921569e-08,
      "loss": 0.0,
      "num_tokens": 37458176.0,
      "reward": 0.7532614469528198,
      "reward_std": 0.1603999137878418,
      "rewards/rna_reward_fn/mean": 0.7532614469528198,
      "rewards/rna_reward_fn/std": 0.31244710087776184,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 157.15625,
      "completions/mean_terminated_length": 157.15625,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.12356984615325928,
      "epoch": 2.835294117647059,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.9720450043678284,
      "learning_rate": 5.88235294117647e-08,
      "loss": -0.0,
      "num_tokens": 37620128.0,
      "reward": 0.7346148490905762,
      "reward_std": 0.15429024398326874,
      "rewards/rna_reward_fn/mean": 0.7346148490905762,
      "rewards/rna_reward_fn/std": 0.31154975295066833,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 153.5,
      "completions/mean_terminated_length": 153.5,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "entropy": 0.1341606229543686,
      "epoch": 2.847058823529412,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5591171979904175,
      "learning_rate": 5.490196078431372e-08,
      "loss": -0.0,
      "num_tokens": 37778336.0,
      "reward": 0.7116289138793945,
      "reward_std": 0.21866443753242493,
      "rewards/rna_reward_fn/mean": 0.7116289138793945,
      "rewards/rna_reward_fn/std": 0.2980954051017761,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 482.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 203.4375,
      "completions/mean_terminated_length": 203.4375,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "entropy": 0.14845673739910126,
      "epoch": 2.8588235294117648,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5372319221496582,
      "learning_rate": 5.098039215686274e-08,
      "loss": 0.0,
      "num_tokens": 37987680.0,
      "reward": 0.7392944693565369,
      "reward_std": 0.19700977206230164,
      "rewards/rna_reward_fn/mean": 0.7392945289611816,
      "rewards/rna_reward_fn/std": 0.30940258502960205,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 139.6875,
      "completions/mean_terminated_length": 139.6875,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.13047143816947937,
      "epoch": 2.8705882352941177,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5987316370010376,
      "learning_rate": 4.705882352941176e-08,
      "loss": -0.0,
      "num_tokens": 38131744.0,
      "reward": 0.6977779269218445,
      "reward_std": 0.2151854932308197,
      "rewards/rna_reward_fn/mean": 0.6977779269218445,
      "rewards/rna_reward_fn/std": 0.3459690511226654,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 490.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 148.40625,
      "completions/mean_terminated_length": 148.40625,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "entropy": 0.14810562878847122,
      "epoch": 2.8823529411764706,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7430775165557861,
      "learning_rate": 4.313725490196078e-08,
      "loss": -0.0,
      "num_tokens": 38284736.0,
      "reward": 0.6900802254676819,
      "reward_std": 0.18723735213279724,
      "rewards/rna_reward_fn/mean": 0.6900802254676819,
      "rewards/rna_reward_fn/std": 0.3328934609889984,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 139.5625,
      "completions/mean_terminated_length": 139.5625,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "entropy": 0.12182106822729111,
      "epoch": 2.8941176470588235,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.49635204672813416,
      "learning_rate": 3.9215686274509804e-08,
      "loss": 0.0,
      "num_tokens": 38428672.0,
      "reward": 0.7072439193725586,
      "reward_std": 0.1840672791004181,
      "rewards/rna_reward_fn/mean": 0.7072439193725586,
      "rewards/rna_reward_fn/std": 0.3065541088581085,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 167.6875,
      "completions/mean_terminated_length": 167.6875,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "entropy": 0.13815301656723022,
      "epoch": 2.9058823529411764,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.8550586104393005,
      "learning_rate": 3.5294117647058824e-08,
      "loss": -0.0,
      "num_tokens": 38601408.0,
      "reward": 0.7532185316085815,
      "reward_std": 0.1475568264722824,
      "rewards/rna_reward_fn/mean": 0.7532185316085815,
      "rewards/rna_reward_fn/std": 0.29489991068840027,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 340.0,
      "completions/max_terminated_length": 340.0,
      "completions/mean_length": 122.34375,
      "completions/mean_terminated_length": 122.34375,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "entropy": 0.12259503453969955,
      "epoch": 2.9176470588235293,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.44689512252807617,
      "learning_rate": 3.1372549019607844e-08,
      "loss": 0.0,
      "num_tokens": 38727712.0,
      "reward": 0.7440149784088135,
      "reward_std": 0.1674138307571411,
      "rewards/rna_reward_fn/mean": 0.7440149188041687,
      "rewards/rna_reward_fn/std": 0.3040436804294586,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 474.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 192.0,
      "completions/mean_terminated_length": 192.0,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "entropy": 0.1282111555337906,
      "epoch": 2.9294117647058826,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5679563879966736,
      "learning_rate": 2.745098039215686e-08,
      "loss": 0.0,
      "num_tokens": 38925344.0,
      "reward": 0.6850175857543945,
      "reward_std": 0.19530020654201508,
      "rewards/rna_reward_fn/mean": 0.6850175857543945,
      "rewards/rna_reward_fn/std": 0.33921393752098083,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 117.09375,
      "completions/mean_terminated_length": 117.09375,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "entropy": 0.12855321913957596,
      "epoch": 2.9411764705882355,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.505153238773346,
      "learning_rate": 2.352941176470588e-08,
      "loss": -0.0,
      "num_tokens": 39046272.0,
      "reward": 0.6269246339797974,
      "reward_std": 0.16829745471477509,
      "rewards/rna_reward_fn/mean": 0.6269246339797974,
      "rewards/rna_reward_fn/std": 0.33109787106513977,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 424.0,
      "completions/max_terminated_length": 424.0,
      "completions/mean_length": 121.0,
      "completions/mean_terminated_length": 121.0,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "entropy": 0.12059168517589569,
      "epoch": 2.9529411764705884,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4366406500339508,
      "learning_rate": 1.9607843137254902e-08,
      "loss": 0.0,
      "num_tokens": 39171200.0,
      "reward": 0.7053718566894531,
      "reward_std": 0.14770260453224182,
      "rewards/rna_reward_fn/mean": 0.7053717970848083,
      "rewards/rna_reward_fn/std": 0.3234374523162842,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 327.0,
      "completions/max_terminated_length": 327.0,
      "completions/mean_length": 132.1875,
      "completions/mean_terminated_length": 132.1875,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "entropy": 0.13018939644098282,
      "epoch": 2.9647058823529413,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6731492280960083,
      "learning_rate": 1.5686274509803922e-08,
      "loss": 0.0,
      "num_tokens": 39307584.0,
      "reward": 0.7679715752601624,
      "reward_std": 0.17536047101020813,
      "rewards/rna_reward_fn/mean": 0.7679715156555176,
      "rewards/rna_reward_fn/std": 0.2801183760166168,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 145.40625,
      "completions/mean_terminated_length": 145.40625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "entropy": 0.10920717194676399,
      "epoch": 2.976470588235294,
      "frac_reward_zero_std": 0.03125,
      "grad_norm": 0.46245628595352173,
      "learning_rate": 1.176470588235294e-08,
      "loss": 0.0,
      "num_tokens": 39457504.0,
      "reward": 0.7559751272201538,
      "reward_std": 0.15144692361354828,
      "rewards/rna_reward_fn/mean": 0.7559751272201538,
      "rewards/rna_reward_fn/std": 0.3152746260166168,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 193.3125,
      "completions/mean_terminated_length": 193.3125,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "entropy": 0.15460387617349625,
      "epoch": 2.988235294117647,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6124170422554016,
      "learning_rate": 7.843137254901961e-09,
      "loss": 0.0,
      "num_tokens": 39656480.0,
      "reward": 0.7068374752998352,
      "reward_std": 0.19490104913711548,
      "rewards/rna_reward_fn/mean": 0.7068374752998352,
      "rewards/rna_reward_fn/std": 0.310377836227417,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 401.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 149.5,
      "completions/mean_terminated_length": 149.5,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "entropy": 0.1327020823955536,
      "epoch": 3.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5195903778076172,
      "learning_rate": 3.9215686274509805e-09,
      "loss": -0.0,
      "num_tokens": 39810592.0,
      "reward": 0.7493961453437805,
      "reward_std": 0.17497789859771729,
      "rewards/rna_reward_fn/mean": 0.7493961453437805,
      "rewards/rna_reward_fn/std": 0.31194695830345154,
      "step": 255
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 255,
  "num_input_tokens_seen": 39810592,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 256,
  "trial_name": null,
  "trial_params": null
}