LLM-RNA-Design-2025 / model /SL+RL /trainer_state.json
Milanmg's picture
Initial upload: model + data
482db35 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 255,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 378.0,
"completions/max_terminated_length": 378.0,
"completions/mean_length": 116.875,
"completions/mean_terminated_length": 116.875,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.3960496634244919,
"epoch": 0.011764705882352941,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3658151626586914,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 120704.0,
"reward": 0.42291906476020813,
"reward_std": 0.353160560131073,
"rewards/rna_reward_fn/mean": 0.42291906476020813,
"rewards/rna_reward_fn/std": 0.39480823278427124,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 145.34375,
"completions/mean_terminated_length": 145.34375,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 0.3918581157922745,
"epoch": 0.023529411764705882,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3561055362224579,
"learning_rate": 9.96078431372549e-07,
"loss": 0.0,
"num_tokens": 270560.0,
"reward": 0.4679465889930725,
"reward_std": 0.304127037525177,
"rewards/rna_reward_fn/mean": 0.4679465889930725,
"rewards/rna_reward_fn/std": 0.37357842922210693,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 169.4375,
"completions/mean_terminated_length": 169.4375,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"entropy": 0.3528731167316437,
"epoch": 0.03529411764705882,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3573973476886749,
"learning_rate": 9.92156862745098e-07,
"loss": 0.0,
"num_tokens": 445088.0,
"reward": 0.4688035249710083,
"reward_std": 0.3215726613998413,
"rewards/rna_reward_fn/mean": 0.4688035249710083,
"rewards/rna_reward_fn/std": 0.3945569097995758,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 164.53125,
"completions/mean_terminated_length": 164.53125,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.3565346747636795,
"epoch": 0.047058823529411764,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.37075310945510864,
"learning_rate": 9.88235294117647e-07,
"loss": -0.0,
"num_tokens": 614592.0,
"reward": 0.5333437323570251,
"reward_std": 0.3202625513076782,
"rewards/rna_reward_fn/mean": 0.5333437323570251,
"rewards/rna_reward_fn/std": 0.3746815025806427,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 400.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 103.3125,
"completions/mean_terminated_length": 103.3125,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"entropy": 0.35146908462047577,
"epoch": 0.058823529411764705,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.34449008107185364,
"learning_rate": 9.84313725490196e-07,
"loss": -0.0,
"num_tokens": 721408.0,
"reward": 0.5266900062561035,
"reward_std": 0.32159364223480225,
"rewards/rna_reward_fn/mean": 0.5266900062561035,
"rewards/rna_reward_fn/std": 0.3701845705509186,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 161.25,
"completions/mean_terminated_length": 161.25,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 0.3309106081724167,
"epoch": 0.07058823529411765,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.35763484239578247,
"learning_rate": 9.80392156862745e-07,
"loss": -0.0,
"num_tokens": 887552.0,
"reward": 0.5357265472412109,
"reward_std": 0.2797412872314453,
"rewards/rna_reward_fn/mean": 0.5357265472412109,
"rewards/rna_reward_fn/std": 0.3577335476875305,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 151.375,
"completions/mean_terminated_length": 151.375,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 0.34717176854610443,
"epoch": 0.08235294117647059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3663802146911621,
"learning_rate": 9.76470588235294e-07,
"loss": -0.0,
"num_tokens": 1043584.0,
"reward": 0.547458291053772,
"reward_std": 0.2995288372039795,
"rewards/rna_reward_fn/mean": 0.547458291053772,
"rewards/rna_reward_fn/std": 0.3604092001914978,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 482.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 167.125,
"completions/mean_terminated_length": 167.125,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.31340789794921875,
"epoch": 0.09411764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4071066081523895,
"learning_rate": 9.725490196078432e-07,
"loss": -0.0,
"num_tokens": 1215744.0,
"reward": 0.5176310539245605,
"reward_std": 0.3205966353416443,
"rewards/rna_reward_fn/mean": 0.5176310539245605,
"rewards/rna_reward_fn/std": 0.3642078638076782,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 151.59375,
"completions/mean_terminated_length": 151.59375,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 0.305365189909935,
"epoch": 0.10588235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3989139795303345,
"learning_rate": 9.686274509803921e-07,
"loss": -0.0,
"num_tokens": 1372000.0,
"reward": 0.6008568406105042,
"reward_std": 0.30818045139312744,
"rewards/rna_reward_fn/mean": 0.6008569002151489,
"rewards/rna_reward_fn/std": 0.35290631651878357,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 135.53125,
"completions/mean_terminated_length": 135.53125,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"entropy": 0.2962174266576767,
"epoch": 0.11764705882352941,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.44642144441604614,
"learning_rate": 9.64705882352941e-07,
"loss": 0.0,
"num_tokens": 1511808.0,
"reward": 0.540717601776123,
"reward_std": 0.3060719966888428,
"rewards/rna_reward_fn/mean": 0.540717601776123,
"rewards/rna_reward_fn/std": 0.36574023962020874,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 187.71875,
"completions/mean_terminated_length": 187.71875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.2934599667787552,
"epoch": 0.12941176470588237,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3814420700073242,
"learning_rate": 9.607843137254902e-07,
"loss": -0.0,
"num_tokens": 1705056.0,
"reward": 0.6084277629852295,
"reward_std": 0.3016743063926697,
"rewards/rna_reward_fn/mean": 0.6084277629852295,
"rewards/rna_reward_fn/std": 0.37008586525917053,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 123.65625,
"completions/mean_terminated_length": 123.65625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.28613443672657013,
"epoch": 0.1411764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.34958702325820923,
"learning_rate": 9.568627450980392e-07,
"loss": 0.0,
"num_tokens": 1832704.0,
"reward": 0.6017879247665405,
"reward_std": 0.3006741404533386,
"rewards/rna_reward_fn/mean": 0.6017879247665405,
"rewards/rna_reward_fn/std": 0.35490649938583374,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 140.65625,
"completions/mean_terminated_length": 140.65625,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"entropy": 0.277506560087204,
"epoch": 0.15294117647058825,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5352854132652283,
"learning_rate": 9.529411764705881e-07,
"loss": 0.0,
"num_tokens": 1977760.0,
"reward": 0.571915328502655,
"reward_std": 0.2985040843486786,
"rewards/rna_reward_fn/mean": 0.5719153881072998,
"rewards/rna_reward_fn/std": 0.3767135441303253,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 424.0,
"completions/max_terminated_length": 424.0,
"completions/mean_length": 154.03125,
"completions/mean_terminated_length": 154.03125,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"entropy": 0.2907712608575821,
"epoch": 0.16470588235294117,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.40935981273651123,
"learning_rate": 9.490196078431371e-07,
"loss": 0.0,
"num_tokens": 2136512.0,
"reward": 0.5937778353691101,
"reward_std": 0.270163893699646,
"rewards/rna_reward_fn/mean": 0.5937778353691101,
"rewards/rna_reward_fn/std": 0.3509018123149872,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 184.40625,
"completions/mean_terminated_length": 184.40625,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"entropy": 0.27846619486808777,
"epoch": 0.17647058823529413,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.41769424080848694,
"learning_rate": 9.450980392156862e-07,
"loss": 0.0,
"num_tokens": 2326368.0,
"reward": 0.6163018941879272,
"reward_std": 0.26538053154945374,
"rewards/rna_reward_fn/mean": 0.6163018941879272,
"rewards/rna_reward_fn/std": 0.3496814966201782,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.0,
"completions/max_terminated_length": 389.0,
"completions/mean_length": 117.84375,
"completions/mean_terminated_length": 117.84375,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.2604786157608032,
"epoch": 0.18823529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3845226764678955,
"learning_rate": 9.411764705882352e-07,
"loss": 0.0,
"num_tokens": 2448064.0,
"reward": 0.5925071239471436,
"reward_std": 0.2943580150604248,
"rewards/rna_reward_fn/mean": 0.5925071239471436,
"rewards/rna_reward_fn/std": 0.3674796521663666,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 112.125,
"completions/mean_terminated_length": 112.125,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"entropy": 0.25712524354457855,
"epoch": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.39543959498405457,
"learning_rate": 9.372549019607843e-07,
"loss": -0.0,
"num_tokens": 2563904.0,
"reward": 0.5904660224914551,
"reward_std": 0.26803961396217346,
"rewards/rna_reward_fn/mean": 0.5904660224914551,
"rewards/rna_reward_fn/std": 0.3583122193813324,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 138.40625,
"completions/mean_terminated_length": 138.40625,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.27494488656520844,
"epoch": 0.21176470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32191383838653564,
"learning_rate": 9.333333333333333e-07,
"loss": -0.0,
"num_tokens": 2706656.0,
"reward": 0.6467701196670532,
"reward_std": 0.2634694576263428,
"rewards/rna_reward_fn/mean": 0.6467701196670532,
"rewards/rna_reward_fn/std": 0.3313148319721222,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 137.6875,
"completions/mean_terminated_length": 137.6875,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 0.260918065905571,
"epoch": 0.2235294117647059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4905475378036499,
"learning_rate": 9.294117647058824e-07,
"loss": 0.0,
"num_tokens": 2848672.0,
"reward": 0.5871793031692505,
"reward_std": 0.25154006481170654,
"rewards/rna_reward_fn/mean": 0.5871793031692505,
"rewards/rna_reward_fn/std": 0.3587729334831238,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 166.78125,
"completions/mean_terminated_length": 166.78125,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 0.26801037788391113,
"epoch": 0.23529411764705882,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7330372929573059,
"learning_rate": 9.254901960784314e-07,
"loss": -0.0,
"num_tokens": 3020480.0,
"reward": 0.5460379123687744,
"reward_std": 0.27695512771606445,
"rewards/rna_reward_fn/mean": 0.5460379123687744,
"rewards/rna_reward_fn/std": 0.37495046854019165,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 142.6875,
"completions/mean_terminated_length": 142.6875,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.26508544385433197,
"epoch": 0.24705882352941178,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4575193524360657,
"learning_rate": 9.215686274509803e-07,
"loss": 0.0,
"num_tokens": 3167616.0,
"reward": 0.6192805171012878,
"reward_std": 0.2736813426017761,
"rewards/rna_reward_fn/mean": 0.6192805171012878,
"rewards/rna_reward_fn/std": 0.3539046049118042,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 154.25,
"completions/mean_terminated_length": 154.25,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 0.25467583537101746,
"epoch": 0.25882352941176473,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.407436341047287,
"learning_rate": 9.176470588235293e-07,
"loss": 0.0,
"num_tokens": 3326592.0,
"reward": 0.5778753757476807,
"reward_std": 0.27449485659599304,
"rewards/rna_reward_fn/mean": 0.5778753757476807,
"rewards/rna_reward_fn/std": 0.3692671060562134,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 376.0,
"completions/max_terminated_length": 376.0,
"completions/mean_length": 135.46875,
"completions/mean_terminated_length": 135.46875,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.23743800073862076,
"epoch": 0.27058823529411763,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.36481839418411255,
"learning_rate": 9.137254901960783e-07,
"loss": -0.0,
"num_tokens": 3466336.0,
"reward": 0.6230462193489075,
"reward_std": 0.27385085821151733,
"rewards/rna_reward_fn/mean": 0.6230462193489075,
"rewards/rna_reward_fn/std": 0.35384857654571533,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.0,
"completions/max_terminated_length": 399.0,
"completions/mean_length": 159.25,
"completions/mean_terminated_length": 159.25,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.2592047303915024,
"epoch": 0.2823529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.40386804938316345,
"learning_rate": 9.098039215686274e-07,
"loss": -0.0,
"num_tokens": 3630432.0,
"reward": 0.587247908115387,
"reward_std": 0.26836222410202026,
"rewards/rna_reward_fn/mean": 0.587247908115387,
"rewards/rna_reward_fn/std": 0.3811717927455902,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 400.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 152.375,
"completions/mean_terminated_length": 152.375,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.23664871603250504,
"epoch": 0.29411764705882354,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.514864444732666,
"learning_rate": 9.058823529411764e-07,
"loss": -0.0,
"num_tokens": 3787488.0,
"reward": 0.6044737696647644,
"reward_std": 0.2556478679180145,
"rewards/rna_reward_fn/mean": 0.6044737696647644,
"rewards/rna_reward_fn/std": 0.3558889329433441,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 418.0,
"completions/max_terminated_length": 418.0,
"completions/mean_length": 140.5,
"completions/mean_terminated_length": 140.5,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 0.2437874600291252,
"epoch": 0.3058823529411765,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4290100038051605,
"learning_rate": 9.019607843137255e-07,
"loss": -0.0,
"num_tokens": 3932384.0,
"reward": 0.583857536315918,
"reward_std": 0.2450568526983261,
"rewards/rna_reward_fn/mean": 0.583857536315918,
"rewards/rna_reward_fn/std": 0.3653680384159088,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 472.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 164.8125,
"completions/mean_terminated_length": 164.8125,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.24944818764925003,
"epoch": 0.3176470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42284926772117615,
"learning_rate": 8.980392156862745e-07,
"loss": -0.0,
"num_tokens": 4102176.0,
"reward": 0.5925735235214233,
"reward_std": 0.2968187630176544,
"rewards/rna_reward_fn/mean": 0.5925735235214233,
"rewards/rna_reward_fn/std": 0.3608212471008301,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 397.0,
"completions/max_terminated_length": 397.0,
"completions/mean_length": 146.1875,
"completions/mean_terminated_length": 146.1875,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"entropy": 0.22080854326486588,
"epoch": 0.32941176470588235,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4605961740016937,
"learning_rate": 8.941176470588236e-07,
"loss": 0.0,
"num_tokens": 4252896.0,
"reward": 0.5584173202514648,
"reward_std": 0.2890748083591461,
"rewards/rna_reward_fn/mean": 0.5584173202514648,
"rewards/rna_reward_fn/std": 0.3958645462989807,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 175.90625,
"completions/mean_terminated_length": 175.90625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.2321019321680069,
"epoch": 0.3411764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5582552552223206,
"learning_rate": 8.901960784313724e-07,
"loss": 0.0,
"num_tokens": 4434048.0,
"reward": 0.5966294407844543,
"reward_std": 0.2823025584220886,
"rewards/rna_reward_fn/mean": 0.5966294407844543,
"rewards/rna_reward_fn/std": 0.3560717701911926,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 174.1875,
"completions/mean_terminated_length": 174.1875,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 0.21510899811983109,
"epoch": 0.35294117647058826,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.49061208963394165,
"learning_rate": 8.862745098039215e-07,
"loss": -0.0,
"num_tokens": 4613440.0,
"reward": 0.5848400592803955,
"reward_std": 0.267974317073822,
"rewards/rna_reward_fn/mean": 0.5848400592803955,
"rewards/rna_reward_fn/std": 0.37775954604148865,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 490.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 163.15625,
"completions/mean_terminated_length": 163.15625,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 0.2507341653108597,
"epoch": 0.36470588235294116,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.603717029094696,
"learning_rate": 8.823529411764705e-07,
"loss": 0.0,
"num_tokens": 4781536.0,
"reward": 0.6572607159614563,
"reward_std": 0.2553848624229431,
"rewards/rna_reward_fn/mean": 0.6572607159614563,
"rewards/rna_reward_fn/std": 0.3443078398704529,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 170.34375,
"completions/mean_terminated_length": 170.34375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.2254045456647873,
"epoch": 0.3764705882352941,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5129714608192444,
"learning_rate": 8.784313725490196e-07,
"loss": -0.0,
"num_tokens": 4956992.0,
"reward": 0.6237974762916565,
"reward_std": 0.2781754732131958,
"rewards/rna_reward_fn/mean": 0.6237974762916565,
"rewards/rna_reward_fn/std": 0.37038782238960266,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 500.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 140.96875,
"completions/mean_terminated_length": 140.96875,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.23444515466690063,
"epoch": 0.38823529411764707,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5718296766281128,
"learning_rate": 8.745098039215686e-07,
"loss": -0.0,
"num_tokens": 5102368.0,
"reward": 0.663845956325531,
"reward_std": 0.23731249570846558,
"rewards/rna_reward_fn/mean": 0.6638458967208862,
"rewards/rna_reward_fn/std": 0.3386061191558838,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 426.0,
"completions/max_terminated_length": 426.0,
"completions/mean_length": 135.84375,
"completions/mean_terminated_length": 135.84375,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"entropy": 0.21551413834095,
"epoch": 0.4,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.48484402894973755,
"learning_rate": 8.705882352941177e-07,
"loss": 0.0,
"num_tokens": 5242496.0,
"reward": 0.5733575224876404,
"reward_std": 0.2985653281211853,
"rewards/rna_reward_fn/mean": 0.5733575224876404,
"rewards/rna_reward_fn/std": 0.3665997385978699,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 128.34375,
"completions/mean_terminated_length": 128.34375,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.19232773780822754,
"epoch": 0.4117647058823529,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3821423351764679,
"learning_rate": 8.666666666666667e-07,
"loss": 0.0,
"num_tokens": 5374944.0,
"reward": 0.6459628939628601,
"reward_std": 0.27456825971603394,
"rewards/rna_reward_fn/mean": 0.6459628939628601,
"rewards/rna_reward_fn/std": 0.3492187559604645,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 117.28125,
"completions/mean_terminated_length": 117.28125,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"entropy": 0.2170068845152855,
"epoch": 0.4235294117647059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.519378125667572,
"learning_rate": 8.627450980392156e-07,
"loss": -0.0,
"num_tokens": 5496064.0,
"reward": 0.6556386947631836,
"reward_std": 0.2442726194858551,
"rewards/rna_reward_fn/mean": 0.6556386947631836,
"rewards/rna_reward_fn/std": 0.3574485182762146,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 143.75,
"completions/mean_terminated_length": 143.75,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"entropy": 0.23470622301101685,
"epoch": 0.43529411764705883,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4268864691257477,
"learning_rate": 8.588235294117646e-07,
"loss": 0.0,
"num_tokens": 5644288.0,
"reward": 0.6998727917671204,
"reward_std": 0.2536011040210724,
"rewards/rna_reward_fn/mean": 0.6998728513717651,
"rewards/rna_reward_fn/std": 0.34483227133750916,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 161.84375,
"completions/mean_terminated_length": 161.84375,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.20661279559135437,
"epoch": 0.4470588235294118,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.49551478028297424,
"learning_rate": 8.549019607843136e-07,
"loss": 0.0,
"num_tokens": 5811040.0,
"reward": 0.60715651512146,
"reward_std": 0.2498263716697693,
"rewards/rna_reward_fn/mean": 0.60715651512146,
"rewards/rna_reward_fn/std": 0.3692743182182312,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 169.09375,
"completions/mean_terminated_length": 169.09375,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.22686513513326645,
"epoch": 0.4588235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.539655864238739,
"learning_rate": 8.509803921568627e-07,
"loss": 0.0,
"num_tokens": 5985216.0,
"reward": 0.606254518032074,
"reward_std": 0.27362608909606934,
"rewards/rna_reward_fn/mean": 0.606254518032074,
"rewards/rna_reward_fn/std": 0.37834590673446655,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 158.625,
"completions/mean_terminated_length": 158.625,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"entropy": 0.20522872358560562,
"epoch": 0.47058823529411764,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4427753686904907,
"learning_rate": 8.470588235294117e-07,
"loss": 0.0,
"num_tokens": 6148672.0,
"reward": 0.6244011521339417,
"reward_std": 0.2686484158039093,
"rewards/rna_reward_fn/mean": 0.6244011521339417,
"rewards/rna_reward_fn/std": 0.3721536099910736,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 165.5,
"completions/mean_terminated_length": 165.5,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 0.22500251233577728,
"epoch": 0.4823529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8924270272254944,
"learning_rate": 8.431372549019608e-07,
"loss": 0.0,
"num_tokens": 6319168.0,
"reward": 0.5321128368377686,
"reward_std": 0.29077643156051636,
"rewards/rna_reward_fn/mean": 0.5321128368377686,
"rewards/rna_reward_fn/std": 0.3840348422527313,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 160.90625,
"completions/mean_terminated_length": 160.90625,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 0.23232445865869522,
"epoch": 0.49411764705882355,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4376697540283203,
"learning_rate": 8.392156862745098e-07,
"loss": 0.0,
"num_tokens": 6484960.0,
"reward": 0.6353960037231445,
"reward_std": 0.2474566251039505,
"rewards/rna_reward_fn/mean": 0.6353960037231445,
"rewards/rna_reward_fn/std": 0.3577839136123657,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 156.9375,
"completions/mean_terminated_length": 156.9375,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 0.21899814903736115,
"epoch": 0.5058823529411764,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5491610765457153,
"learning_rate": 8.352941176470589e-07,
"loss": -0.0,
"num_tokens": 6646688.0,
"reward": 0.6090617775917053,
"reward_std": 0.2399156093597412,
"rewards/rna_reward_fn/mean": 0.6090618371963501,
"rewards/rna_reward_fn/std": 0.35401132702827454,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 161.28125,
"completions/mean_terminated_length": 161.28125,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"entropy": 0.2018352746963501,
"epoch": 0.5176470588235295,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4728248715400696,
"learning_rate": 8.313725490196078e-07,
"loss": -0.0,
"num_tokens": 6812864.0,
"reward": 0.5414500832557678,
"reward_std": 0.257457435131073,
"rewards/rna_reward_fn/mean": 0.5414501428604126,
"rewards/rna_reward_fn/std": 0.37554678320884705,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 144.53125,
"completions/mean_terminated_length": 144.53125,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.21590139716863632,
"epoch": 0.5294117647058824,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.45613518357276917,
"learning_rate": 8.274509803921567e-07,
"loss": 0.0,
"num_tokens": 6961888.0,
"reward": 0.5840362310409546,
"reward_std": 0.24920199811458588,
"rewards/rna_reward_fn/mean": 0.5840362310409546,
"rewards/rna_reward_fn/std": 0.3838988244533539,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 151.59375,
"completions/mean_terminated_length": 151.59375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.20446214824914932,
"epoch": 0.5411764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4725431799888611,
"learning_rate": 8.235294117647058e-07,
"loss": 0.0,
"num_tokens": 7118144.0,
"reward": 0.5587388277053833,
"reward_std": 0.25771480798721313,
"rewards/rna_reward_fn/mean": 0.5587388277053833,
"rewards/rna_reward_fn/std": 0.3881581127643585,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 472.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 148.09375,
"completions/mean_terminated_length": 148.09375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.20715581625699997,
"epoch": 0.5529411764705883,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5775709748268127,
"learning_rate": 8.196078431372548e-07,
"loss": -0.0,
"num_tokens": 7270816.0,
"reward": 0.6535854935646057,
"reward_std": 0.23074793815612793,
"rewards/rna_reward_fn/mean": 0.6535854339599609,
"rewards/rna_reward_fn/std": 0.35560858249664307,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 148.25,
"completions/mean_terminated_length": 148.25,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.20631568133831024,
"epoch": 0.5647058823529412,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5872889161109924,
"learning_rate": 8.156862745098039e-07,
"loss": -0.0,
"num_tokens": 7423648.0,
"reward": 0.5795817375183105,
"reward_std": 0.26122066378593445,
"rewards/rna_reward_fn/mean": 0.5795817375183105,
"rewards/rna_reward_fn/std": 0.3758288025856018,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 362.0,
"completions/max_terminated_length": 362.0,
"completions/mean_length": 124.71875,
"completions/mean_terminated_length": 124.71875,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 0.19562938064336777,
"epoch": 0.5764705882352941,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.450082391500473,
"learning_rate": 8.117647058823529e-07,
"loss": 0.0,
"num_tokens": 7552384.0,
"reward": 0.657599925994873,
"reward_std": 0.24575895071029663,
"rewards/rna_reward_fn/mean": 0.657599925994873,
"rewards/rna_reward_fn/std": 0.31881189346313477,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 144.53125,
"completions/mean_terminated_length": 144.53125,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.212866373360157,
"epoch": 0.5882352941176471,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4694586992263794,
"learning_rate": 8.07843137254902e-07,
"loss": -0.0,
"num_tokens": 7701408.0,
"reward": 0.5784563422203064,
"reward_std": 0.2643548846244812,
"rewards/rna_reward_fn/mean": 0.5784563422203064,
"rewards/rna_reward_fn/std": 0.3683941066265106,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 138.21875,
"completions/mean_terminated_length": 138.21875,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.17988762259483337,
"epoch": 0.6,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.44588983058929443,
"learning_rate": 8.03921568627451e-07,
"loss": 0.0,
"num_tokens": 7843968.0,
"reward": 0.6563807725906372,
"reward_std": 0.2578202784061432,
"rewards/rna_reward_fn/mean": 0.6563807725906372,
"rewards/rna_reward_fn/std": 0.3404718339443207,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 181.0,
"completions/mean_terminated_length": 181.0,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 0.22444826364517212,
"epoch": 0.611764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.49978184700012207,
"learning_rate": 8e-07,
"loss": -0.0,
"num_tokens": 8030336.0,
"reward": 0.6426054239273071,
"reward_std": 0.2517712712287903,
"rewards/rna_reward_fn/mean": 0.6426054239273071,
"rewards/rna_reward_fn/std": 0.3629717528820038,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 185.40625,
"completions/mean_terminated_length": 185.40625,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 0.20722465217113495,
"epoch": 0.6235294117647059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6321276426315308,
"learning_rate": 7.960784313725489e-07,
"loss": -0.0,
"num_tokens": 8221216.0,
"reward": 0.7105848789215088,
"reward_std": 0.23574814200401306,
"rewards/rna_reward_fn/mean": 0.7105848789215088,
"rewards/rna_reward_fn/std": 0.3385322690010071,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 358.0,
"completions/max_terminated_length": 358.0,
"completions/mean_length": 148.125,
"completions/mean_terminated_length": 148.125,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 0.19676074385643005,
"epoch": 0.6352941176470588,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.49395662546157837,
"learning_rate": 7.92156862745098e-07,
"loss": 0.0,
"num_tokens": 8373920.0,
"reward": 0.5770894885063171,
"reward_std": 0.2644929885864258,
"rewards/rna_reward_fn/mean": 0.5770894289016724,
"rewards/rna_reward_fn/std": 0.3790797293186188,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 159.6875,
"completions/mean_terminated_length": 159.6875,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 0.18705828487873077,
"epoch": 0.6470588235294118,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4197390079498291,
"learning_rate": 7.88235294117647e-07,
"loss": 0.0,
"num_tokens": 8538464.0,
"reward": 0.5764464139938354,
"reward_std": 0.21550722420215607,
"rewards/rna_reward_fn/mean": 0.5764464139938354,
"rewards/rna_reward_fn/std": 0.364503413438797,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 146.125,
"completions/mean_terminated_length": 146.125,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 0.21118487417697906,
"epoch": 0.6588235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.37146326899528503,
"learning_rate": 7.84313725490196e-07,
"loss": 0.0,
"num_tokens": 8689120.0,
"reward": 0.6104137897491455,
"reward_std": 0.23754771053791046,
"rewards/rna_reward_fn/mean": 0.6104137897491455,
"rewards/rna_reward_fn/std": 0.3665221333503723,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 160.65625,
"completions/mean_terminated_length": 160.65625,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 0.1945827156305313,
"epoch": 0.6705882352941176,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4072308838367462,
"learning_rate": 7.803921568627451e-07,
"loss": 0.0,
"num_tokens": 8854656.0,
"reward": 0.6713041067123413,
"reward_std": 0.2212895005941391,
"rewards/rna_reward_fn/mean": 0.6713041067123413,
"rewards/rna_reward_fn/std": 0.3392506539821625,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 142.125,
"completions/mean_terminated_length": 142.125,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 0.18257632106542587,
"epoch": 0.6823529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4942987561225891,
"learning_rate": 7.764705882352941e-07,
"loss": 0.0,
"num_tokens": 9001216.0,
"reward": 0.6629120707511902,
"reward_std": 0.22726097702980042,
"rewards/rna_reward_fn/mean": 0.6629120707511902,
"rewards/rna_reward_fn/std": 0.31348657608032227,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 147.03125,
"completions/mean_terminated_length": 147.03125,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"entropy": 0.20158874243497849,
"epoch": 0.6941176470588235,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5187806487083435,
"learning_rate": 7.725490196078432e-07,
"loss": 0.0,
"num_tokens": 9152800.0,
"reward": 0.6476730108261108,
"reward_std": 0.24552714824676514,
"rewards/rna_reward_fn/mean": 0.6476730108261108,
"rewards/rna_reward_fn/std": 0.33643367886543274,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.0,
"completions/max_terminated_length": 399.0,
"completions/mean_length": 159.4375,
"completions/mean_terminated_length": 159.4375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.18591003119945526,
"epoch": 0.7058823529411765,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.36044302582740784,
"learning_rate": 7.686274509803921e-07,
"loss": 0.0,
"num_tokens": 9317088.0,
"reward": 0.6832787394523621,
"reward_std": 0.22806429862976074,
"rewards/rna_reward_fn/mean": 0.6832787394523621,
"rewards/rna_reward_fn/std": 0.32348689436912537,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 474.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 160.96875,
"completions/mean_terminated_length": 160.96875,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.21002116054296494,
"epoch": 0.7176470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5378114581108093,
"learning_rate": 7.647058823529411e-07,
"loss": -0.0,
"num_tokens": 9482944.0,
"reward": 0.6531599760055542,
"reward_std": 0.22567519545555115,
"rewards/rna_reward_fn/mean": 0.653160035610199,
"rewards/rna_reward_fn/std": 0.33769848942756653,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 348.0,
"completions/max_terminated_length": 348.0,
"completions/mean_length": 116.75,
"completions/mean_terminated_length": 116.75,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 0.18150582909584045,
"epoch": 0.7294117647058823,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.39785146713256836,
"learning_rate": 7.607843137254901e-07,
"loss": -0.0,
"num_tokens": 9603520.0,
"reward": 0.565564751625061,
"reward_std": 0.2807776927947998,
"rewards/rna_reward_fn/mean": 0.565564751625061,
"rewards/rna_reward_fn/std": 0.38936248421669006,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 391.0,
"completions/max_terminated_length": 391.0,
"completions/mean_length": 147.78125,
"completions/mean_terminated_length": 147.78125,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 0.189855195581913,
"epoch": 0.7411764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4473720192909241,
"learning_rate": 7.568627450980392e-07,
"loss": -0.0,
"num_tokens": 9755872.0,
"reward": 0.6822654008865356,
"reward_std": 0.23419374227523804,
"rewards/rna_reward_fn/mean": 0.6822654008865356,
"rewards/rna_reward_fn/std": 0.32637539505958557,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 171.28125,
"completions/mean_terminated_length": 171.28125,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"entropy": 0.19365741312503815,
"epoch": 0.7529411764705882,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5170744061470032,
"learning_rate": 7.529411764705882e-07,
"loss": -0.0,
"num_tokens": 9932288.0,
"reward": 0.6570923328399658,
"reward_std": 0.24268731474876404,
"rewards/rna_reward_fn/mean": 0.6570923328399658,
"rewards/rna_reward_fn/std": 0.3360862731933594,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 400.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 138.5625,
"completions/mean_terminated_length": 138.5625,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"entropy": 0.15700556337833405,
"epoch": 0.7647058823529411,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.669632077217102,
"learning_rate": 7.490196078431373e-07,
"loss": -0.0,
"num_tokens": 10075200.0,
"reward": 0.5884541273117065,
"reward_std": 0.25077739357948303,
"rewards/rna_reward_fn/mean": 0.5884541869163513,
"rewards/rna_reward_fn/std": 0.3707042634487152,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 147.875,
"completions/mean_terminated_length": 147.875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.1868809014558792,
"epoch": 0.7764705882352941,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.496881365776062,
"learning_rate": 7.450980392156863e-07,
"loss": -0.0,
"num_tokens": 10227648.0,
"reward": 0.6189287900924683,
"reward_std": 0.23646032810211182,
"rewards/rna_reward_fn/mean": 0.6189287900924683,
"rewards/rna_reward_fn/std": 0.3614950180053711,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 127.75,
"completions/mean_terminated_length": 127.75,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 0.17434925585985184,
"epoch": 0.788235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5550652742385864,
"learning_rate": 7.411764705882352e-07,
"loss": 0.0,
"num_tokens": 10359488.0,
"reward": 0.5918734073638916,
"reward_std": 0.2727334499359131,
"rewards/rna_reward_fn/mean": 0.5918734073638916,
"rewards/rna_reward_fn/std": 0.35672324895858765,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 151.96875,
"completions/mean_terminated_length": 151.96875,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"entropy": 0.17505493760108948,
"epoch": 0.8,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3833948075771332,
"learning_rate": 7.372549019607843e-07,
"loss": -0.0,
"num_tokens": 10516128.0,
"reward": 0.7000205516815186,
"reward_std": 0.23740704357624054,
"rewards/rna_reward_fn/mean": 0.7000205516815186,
"rewards/rna_reward_fn/std": 0.3234153985977173,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 141.15625,
"completions/mean_terminated_length": 141.15625,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.17628953605890274,
"epoch": 0.8117647058823529,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3673928678035736,
"learning_rate": 7.333333333333332e-07,
"loss": 0.0,
"num_tokens": 10661696.0,
"reward": 0.6538941860198975,
"reward_std": 0.19288064539432526,
"rewards/rna_reward_fn/mean": 0.6538941860198975,
"rewards/rna_reward_fn/std": 0.3515564203262329,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.0,
"completions/max_terminated_length": 399.0,
"completions/mean_length": 195.53125,
"completions/mean_terminated_length": 195.53125,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"entropy": 0.18974752724170685,
"epoch": 0.8235294117647058,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.719358503818512,
"learning_rate": 7.294117647058823e-07,
"loss": -0.0,
"num_tokens": 10862944.0,
"reward": 0.5886421203613281,
"reward_std": 0.23114809393882751,
"rewards/rna_reward_fn/mean": 0.5886421203613281,
"rewards/rna_reward_fn/std": 0.36729925870895386,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 156.46875,
"completions/mean_terminated_length": 156.46875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.17211396992206573,
"epoch": 0.8352941176470589,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4566245377063751,
"learning_rate": 7.254901960784313e-07,
"loss": 0.0,
"num_tokens": 11024192.0,
"reward": 0.6206304430961609,
"reward_std": 0.20096182823181152,
"rewards/rna_reward_fn/mean": 0.6206304430961609,
"rewards/rna_reward_fn/std": 0.3349648714065552,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 124.84375,
"completions/mean_terminated_length": 124.84375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.16766826063394547,
"epoch": 0.8470588235294118,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4164656698703766,
"learning_rate": 7.215686274509804e-07,
"loss": -0.0,
"num_tokens": 11153056.0,
"reward": 0.6351762413978577,
"reward_std": 0.2213377058506012,
"rewards/rna_reward_fn/mean": 0.6351762413978577,
"rewards/rna_reward_fn/std": 0.3493310809135437,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 412.0,
"completions/max_terminated_length": 412.0,
"completions/mean_length": 129.65625,
"completions/mean_terminated_length": 129.65625,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 0.16023673117160797,
"epoch": 0.8588235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.41499394178390503,
"learning_rate": 7.176470588235294e-07,
"loss": 0.0,
"num_tokens": 11286848.0,
"reward": 0.6752070784568787,
"reward_std": 0.24617840349674225,
"rewards/rna_reward_fn/mean": 0.6752070784568787,
"rewards/rna_reward_fn/std": 0.34732139110565186,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 164.9375,
"completions/mean_terminated_length": 164.9375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.18363939225673676,
"epoch": 0.8705882352941177,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.45577237010002136,
"learning_rate": 7.137254901960785e-07,
"loss": -0.0,
"num_tokens": 11456768.0,
"reward": 0.5772933959960938,
"reward_std": 0.23847423493862152,
"rewards/rna_reward_fn/mean": 0.5772933959960938,
"rewards/rna_reward_fn/std": 0.3823261260986328,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 492.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 188.28125,
"completions/mean_terminated_length": 188.28125,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"entropy": 0.1838960349559784,
"epoch": 0.8823529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5237012505531311,
"learning_rate": 7.098039215686274e-07,
"loss": 0.0,
"num_tokens": 11650592.0,
"reward": 0.6181286573410034,
"reward_std": 0.2555590569972992,
"rewards/rna_reward_fn/mean": 0.6181286573410034,
"rewards/rna_reward_fn/std": 0.37019652128219604,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 182.1875,
"completions/mean_terminated_length": 182.1875,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.1790659874677658,
"epoch": 0.8941176470588236,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4818723499774933,
"learning_rate": 7.058823529411765e-07,
"loss": -0.0,
"num_tokens": 11838176.0,
"reward": 0.578412652015686,
"reward_std": 0.22860457003116608,
"rewards/rna_reward_fn/mean": 0.578412652015686,
"rewards/rna_reward_fn/std": 0.35265785455703735,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 200.21875,
"completions/mean_terminated_length": 200.21875,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 0.18565233796834946,
"epoch": 0.9058823529411765,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7948216795921326,
"learning_rate": 7.019607843137254e-07,
"loss": 0.0,
"num_tokens": 12044224.0,
"reward": 0.6187993288040161,
"reward_std": 0.2622474431991577,
"rewards/rna_reward_fn/mean": 0.6187993288040161,
"rewards/rna_reward_fn/std": 0.326750248670578,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 145.8125,
"completions/mean_terminated_length": 145.8125,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"entropy": 0.17154797911643982,
"epoch": 0.9176470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47565799951553345,
"learning_rate": 6.980392156862744e-07,
"loss": -0.0,
"num_tokens": 12194560.0,
"reward": 0.5971746444702148,
"reward_std": 0.18512360751628876,
"rewards/rna_reward_fn/mean": 0.5971747040748596,
"rewards/rna_reward_fn/std": 0.3710518777370453,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 373.0,
"completions/max_terminated_length": 373.0,
"completions/mean_length": 128.71875,
"completions/mean_terminated_length": 128.71875,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.15196984261274338,
"epoch": 0.9294117647058824,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4943343698978424,
"learning_rate": 6.941176470588235e-07,
"loss": -0.0,
"num_tokens": 12327392.0,
"reward": 0.6471496820449829,
"reward_std": 0.22329822182655334,
"rewards/rna_reward_fn/mean": 0.6471496820449829,
"rewards/rna_reward_fn/std": 0.33536407351493835,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 400.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 137.84375,
"completions/mean_terminated_length": 137.84375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.16948848217725754,
"epoch": 0.9411764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4759957492351532,
"learning_rate": 6.901960784313725e-07,
"loss": -0.0,
"num_tokens": 12469568.0,
"reward": 0.659608006477356,
"reward_std": 0.18602336943149567,
"rewards/rna_reward_fn/mean": 0.659608006477356,
"rewards/rna_reward_fn/std": 0.3731914460659027,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 146.75,
"completions/mean_terminated_length": 146.75,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"entropy": 0.18501683324575424,
"epoch": 0.9529411764705882,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.43785735964775085,
"learning_rate": 6.862745098039216e-07,
"loss": 0.0,
"num_tokens": 12620864.0,
"reward": 0.620478630065918,
"reward_std": 0.22393935918807983,
"rewards/rna_reward_fn/mean": 0.620478630065918,
"rewards/rna_reward_fn/std": 0.35981276631355286,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 150.1875,
"completions/mean_terminated_length": 150.1875,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 0.1829531416296959,
"epoch": 0.9647058823529412,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4392038583755493,
"learning_rate": 6.823529411764706e-07,
"loss": 0.0,
"num_tokens": 12775680.0,
"reward": 0.6712214350700378,
"reward_std": 0.2174052894115448,
"rewards/rna_reward_fn/mean": 0.6712214946746826,
"rewards/rna_reward_fn/std": 0.3370954990386963,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 391.0,
"completions/max_terminated_length": 391.0,
"completions/mean_length": 141.8125,
"completions/mean_terminated_length": 141.8125,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.1686822921037674,
"epoch": 0.9764705882352941,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4484212398529053,
"learning_rate": 6.784313725490196e-07,
"loss": -0.0,
"num_tokens": 12921920.0,
"reward": 0.6464422345161438,
"reward_std": 0.2250806838274002,
"rewards/rna_reward_fn/mean": 0.6464422345161438,
"rewards/rna_reward_fn/std": 0.3622319996356964,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 166.65625,
"completions/mean_terminated_length": 166.65625,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.17645781487226486,
"epoch": 0.9882352941176471,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7668079137802124,
"learning_rate": 6.745098039215686e-07,
"loss": 0.0,
"num_tokens": 13093600.0,
"reward": 0.6832870244979858,
"reward_std": 0.25750601291656494,
"rewards/rna_reward_fn/mean": 0.6832869648933411,
"rewards/rna_reward_fn/std": 0.3430787920951843,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 167.96875,
"completions/mean_terminated_length": 167.96875,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"entropy": 0.17668870836496353,
"epoch": 1.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.43097105622291565,
"learning_rate": 6.705882352941176e-07,
"loss": 0.0,
"num_tokens": 13266624.0,
"reward": 0.5539568662643433,
"reward_std": 0.22693298757076263,
"rewards/rna_reward_fn/mean": 0.5539568066596985,
"rewards/rna_reward_fn/std": 0.38347697257995605,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 182.3125,
"completions/mean_terminated_length": 182.3125,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"entropy": 0.1827656850218773,
"epoch": 1.011764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5608375668525696,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0,
"num_tokens": 13454336.0,
"reward": 0.7320628762245178,
"reward_std": 0.22256582975387573,
"rewards/rna_reward_fn/mean": 0.7320628762245178,
"rewards/rna_reward_fn/std": 0.30846187472343445,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 140.625,
"completions/mean_terminated_length": 140.625,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 0.18483393639326096,
"epoch": 1.0235294117647058,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4667685627937317,
"learning_rate": 6.627450980392156e-07,
"loss": 0.0,
"num_tokens": 13599360.0,
"reward": 0.6894385814666748,
"reward_std": 0.20523157715797424,
"rewards/rna_reward_fn/mean": 0.6894385814666748,
"rewards/rna_reward_fn/std": 0.3155847191810608,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 150.46875,
"completions/mean_terminated_length": 150.46875,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 0.16182925552129745,
"epoch": 1.035294117647059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6056375503540039,
"learning_rate": 6.588235294117647e-07,
"loss": -0.0,
"num_tokens": 13754464.0,
"reward": 0.6177388429641724,
"reward_std": 0.24611341953277588,
"rewards/rna_reward_fn/mean": 0.6177388429641724,
"rewards/rna_reward_fn/std": 0.3494950830936432,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 144.09375,
"completions/mean_terminated_length": 144.09375,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 0.17024414986371994,
"epoch": 1.0470588235294118,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4357620179653168,
"learning_rate": 6.549019607843137e-07,
"loss": -0.0,
"num_tokens": 13903040.0,
"reward": 0.611262857913971,
"reward_std": 0.19428220391273499,
"rewards/rna_reward_fn/mean": 0.611262857913971,
"rewards/rna_reward_fn/std": 0.3793390393257141,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 132.5625,
"completions/mean_terminated_length": 132.5625,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 0.16757714748382568,
"epoch": 1.0588235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.440759539604187,
"learning_rate": 6.509803921568627e-07,
"loss": -0.0,
"num_tokens": 14039808.0,
"reward": 0.6882448196411133,
"reward_std": 0.19556942582130432,
"rewards/rna_reward_fn/mean": 0.6882448196411133,
"rewards/rna_reward_fn/std": 0.32508718967437744,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 143.78125,
"completions/mean_terminated_length": 143.78125,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.1645500287413597,
"epoch": 1.0705882352941176,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5613058805465698,
"learning_rate": 6.470588235294117e-07,
"loss": 0.0,
"num_tokens": 14188064.0,
"reward": 0.6789584159851074,
"reward_std": 0.19199398159980774,
"rewards/rna_reward_fn/mean": 0.6789584159851074,
"rewards/rna_reward_fn/std": 0.3482169210910797,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.0,
"completions/max_terminated_length": 399.0,
"completions/mean_length": 118.34375,
"completions/mean_terminated_length": 118.34375,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 0.14176590740680695,
"epoch": 1.0823529411764705,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4092370867729187,
"learning_rate": 6.431372549019608e-07,
"loss": -0.0,
"num_tokens": 14310272.0,
"reward": 0.650740921497345,
"reward_std": 0.18103614449501038,
"rewards/rna_reward_fn/mean": 0.650740921497345,
"rewards/rna_reward_fn/std": 0.32734215259552,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 154.3125,
"completions/mean_terminated_length": 154.3125,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.176346056163311,
"epoch": 1.0941176470588236,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4459090232849121,
"learning_rate": 6.392156862745097e-07,
"loss": 0.0,
"num_tokens": 14469312.0,
"reward": 0.6732466816902161,
"reward_std": 0.22345304489135742,
"rewards/rna_reward_fn/mean": 0.6732466816902161,
"rewards/rna_reward_fn/std": 0.3369784951210022,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 145.3125,
"completions/mean_terminated_length": 145.3125,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.1685405969619751,
"epoch": 1.1058823529411765,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5043669939041138,
"learning_rate": 6.352941176470588e-07,
"loss": -0.0,
"num_tokens": 14619136.0,
"reward": 0.677271842956543,
"reward_std": 0.20296773314476013,
"rewards/rna_reward_fn/mean": 0.677271842956543,
"rewards/rna_reward_fn/std": 0.320669025182724,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 170.1875,
"completions/mean_terminated_length": 170.1875,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.18431222438812256,
"epoch": 1.1176470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6736860275268555,
"learning_rate": 6.313725490196078e-07,
"loss": -0.0,
"num_tokens": 14794432.0,
"reward": 0.6684234738349915,
"reward_std": 0.259125292301178,
"rewards/rna_reward_fn/mean": 0.6684235334396362,
"rewards/rna_reward_fn/std": 0.34210121631622314,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 157.6875,
"completions/mean_terminated_length": 157.6875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.16836901009082794,
"epoch": 1.1294117647058823,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4569699764251709,
"learning_rate": 6.274509803921569e-07,
"loss": -0.0,
"num_tokens": 14956928.0,
"reward": 0.68538498878479,
"reward_std": 0.1874302327632904,
"rewards/rna_reward_fn/mean": 0.68538498878479,
"rewards/rna_reward_fn/std": 0.295845091342926,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 393.0,
"completions/max_terminated_length": 393.0,
"completions/mean_length": 140.21875,
"completions/mean_terminated_length": 140.21875,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"entropy": 0.158738911151886,
"epoch": 1.1411764705882352,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4725809395313263,
"learning_rate": 6.235294117647059e-07,
"loss": 0.0,
"num_tokens": 15101536.0,
"reward": 0.6654532551765442,
"reward_std": 0.18864062428474426,
"rewards/rna_reward_fn/mean": 0.6654532551765442,
"rewards/rna_reward_fn/std": 0.3371845781803131,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 162.8125,
"completions/mean_terminated_length": 162.8125,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 0.17738928645849228,
"epoch": 1.1529411764705881,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5798309445381165,
"learning_rate": 6.196078431372548e-07,
"loss": -0.0,
"num_tokens": 15269280.0,
"reward": 0.7147358655929565,
"reward_std": 0.21203583478927612,
"rewards/rna_reward_fn/mean": 0.7147358655929565,
"rewards/rna_reward_fn/std": 0.33255505561828613,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 168.03125,
"completions/mean_terminated_length": 168.03125,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.17116892337799072,
"epoch": 1.1647058823529413,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5520632863044739,
"learning_rate": 6.156862745098039e-07,
"loss": -0.0,
"num_tokens": 15442368.0,
"reward": 0.6365219950675964,
"reward_std": 0.20218491554260254,
"rewards/rna_reward_fn/mean": 0.6365219950675964,
"rewards/rna_reward_fn/std": 0.35175827145576477,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 397.0,
"completions/max_terminated_length": 397.0,
"completions/mean_length": 138.0,
"completions/mean_terminated_length": 138.0,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.17306677252054214,
"epoch": 1.1764705882352942,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4389256238937378,
"learning_rate": 6.117647058823529e-07,
"loss": 0.0,
"num_tokens": 15584704.0,
"reward": 0.7388399839401245,
"reward_std": 0.16607630252838135,
"rewards/rna_reward_fn/mean": 0.7388399839401245,
"rewards/rna_reward_fn/std": 0.2576732635498047,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 392.0,
"completions/max_terminated_length": 392.0,
"completions/mean_length": 137.40625,
"completions/mean_terminated_length": 137.40625,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 0.15397901087999344,
"epoch": 1.188235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5594757795333862,
"learning_rate": 6.078431372549019e-07,
"loss": -0.0,
"num_tokens": 15726432.0,
"reward": 0.7157045602798462,
"reward_std": 0.22128766775131226,
"rewards/rna_reward_fn/mean": 0.7157045602798462,
"rewards/rna_reward_fn/std": 0.2969537079334259,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 400.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 127.78125,
"completions/mean_terminated_length": 127.78125,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.17225481569766998,
"epoch": 1.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.40622541308403015,
"learning_rate": 6.039215686274509e-07,
"loss": -0.0,
"num_tokens": 15858304.0,
"reward": 0.7043038010597229,
"reward_std": 0.22727924585342407,
"rewards/rna_reward_fn/mean": 0.7043038606643677,
"rewards/rna_reward_fn/std": 0.33978909254074097,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 167.625,
"completions/mean_terminated_length": 167.625,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 0.17464321851730347,
"epoch": 1.2117647058823529,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4603181779384613,
"learning_rate": 6e-07,
"loss": -0.0,
"num_tokens": 16030976.0,
"reward": 0.61054527759552,
"reward_std": 0.22179073095321655,
"rewards/rna_reward_fn/mean": 0.61054527759552,
"rewards/rna_reward_fn/std": 0.37210676074028015,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 156.8125,
"completions/mean_terminated_length": 156.8125,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"entropy": 0.1658085659146309,
"epoch": 1.223529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4843849539756775,
"learning_rate": 5.96078431372549e-07,
"loss": -0.0,
"num_tokens": 16192576.0,
"reward": 0.6978532075881958,
"reward_std": 0.1981123685836792,
"rewards/rna_reward_fn/mean": 0.6978532671928406,
"rewards/rna_reward_fn/std": 0.3141247630119324,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 181.1875,
"completions/mean_terminated_length": 181.1875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.16212371736764908,
"epoch": 1.2352941176470589,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5290284752845764,
"learning_rate": 5.921568627450981e-07,
"loss": 0.0,
"num_tokens": 16379136.0,
"reward": 0.6463083028793335,
"reward_std": 0.1896321177482605,
"rewards/rna_reward_fn/mean": 0.6463083028793335,
"rewards/rna_reward_fn/std": 0.36457034945487976,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 282.0,
"completions/max_terminated_length": 282.0,
"completions/mean_length": 124.3125,
"completions/mean_terminated_length": 124.3125,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"entropy": 0.15162574499845505,
"epoch": 1.2470588235294118,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47445422410964966,
"learning_rate": 5.88235294117647e-07,
"loss": 0.0,
"num_tokens": 16507456.0,
"reward": 0.672465980052948,
"reward_std": 0.20273976027965546,
"rewards/rna_reward_fn/mean": 0.6724659204483032,
"rewards/rna_reward_fn/std": 0.3352026343345642,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 154.25,
"completions/mean_terminated_length": 154.25,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 0.1651393622159958,
"epoch": 1.2588235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.48081472516059875,
"learning_rate": 5.843137254901961e-07,
"loss": -0.0,
"num_tokens": 16666432.0,
"reward": 0.6745295524597168,
"reward_std": 0.21466964483261108,
"rewards/rna_reward_fn/mean": 0.6745295524597168,
"rewards/rna_reward_fn/std": 0.3604423701763153,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 176.34375,
"completions/mean_terminated_length": 176.34375,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.16943742334842682,
"epoch": 1.2705882352941176,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4931647777557373,
"learning_rate": 5.803921568627451e-07,
"loss": 0.0,
"num_tokens": 16848032.0,
"reward": 0.6875256896018982,
"reward_std": 0.2435401976108551,
"rewards/rna_reward_fn/mean": 0.6875256896018982,
"rewards/rna_reward_fn/std": 0.3279384672641754,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 158.09375,
"completions/mean_terminated_length": 158.09375,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 0.17465446144342422,
"epoch": 1.2823529411764705,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5001822113990784,
"learning_rate": 5.76470588235294e-07,
"loss": 0.0,
"num_tokens": 17010944.0,
"reward": 0.6029446125030518,
"reward_std": 0.1757221221923828,
"rewards/rna_reward_fn/mean": 0.6029446125030518,
"rewards/rna_reward_fn/std": 0.35652756690979004,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 426.0,
"completions/max_terminated_length": 426.0,
"completions/mean_length": 167.40625,
"completions/mean_terminated_length": 167.40625,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 0.16541431099176407,
"epoch": 1.2941176470588236,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4689631760120392,
"learning_rate": 5.725490196078431e-07,
"loss": -0.0,
"num_tokens": 17183392.0,
"reward": 0.6704152226448059,
"reward_std": 0.20997245609760284,
"rewards/rna_reward_fn/mean": 0.6704152226448059,
"rewards/rna_reward_fn/std": 0.32471874356269836,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 398.0,
"completions/max_terminated_length": 398.0,
"completions/mean_length": 141.71875,
"completions/mean_terminated_length": 141.71875,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 0.1647869274020195,
"epoch": 1.3058823529411765,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5760033130645752,
"learning_rate": 5.686274509803921e-07,
"loss": -0.0,
"num_tokens": 17329536.0,
"reward": 0.6938682198524475,
"reward_std": 0.20044496655464172,
"rewards/rna_reward_fn/mean": 0.6938682198524475,
"rewards/rna_reward_fn/std": 0.32881274819374084,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 115.96875,
"completions/mean_terminated_length": 115.96875,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.1390109360218048,
"epoch": 1.3176470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5902699828147888,
"learning_rate": 5.647058823529412e-07,
"loss": 0.0,
"num_tokens": 17449312.0,
"reward": 0.651271402835846,
"reward_std": 0.17913030087947845,
"rewards/rna_reward_fn/mean": 0.651271402835846,
"rewards/rna_reward_fn/std": 0.3490009009838104,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 472.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 179.8125,
"completions/mean_terminated_length": 179.8125,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 0.16215970367193222,
"epoch": 1.3294117647058823,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6261849403381348,
"learning_rate": 5.607843137254902e-07,
"loss": -0.0,
"num_tokens": 17634464.0,
"reward": 0.6400759220123291,
"reward_std": 0.2095731794834137,
"rewards/rna_reward_fn/mean": 0.6400759220123291,
"rewards/rna_reward_fn/std": 0.34743088483810425,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 139.59375,
"completions/mean_terminated_length": 139.59375,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 0.17950539290905,
"epoch": 1.3411764705882354,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4431358277797699,
"learning_rate": 5.568627450980392e-07,
"loss": 0.0,
"num_tokens": 17778432.0,
"reward": 0.7148804068565369,
"reward_std": 0.19681406021118164,
"rewards/rna_reward_fn/mean": 0.7148803472518921,
"rewards/rna_reward_fn/std": 0.2995694577693939,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 482.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 167.6875,
"completions/mean_terminated_length": 167.6875,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"entropy": 0.16394728422164917,
"epoch": 1.3529411764705883,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4245275557041168,
"learning_rate": 5.529411764705882e-07,
"loss": -0.0,
"num_tokens": 17951168.0,
"reward": 0.6865168213844299,
"reward_std": 0.21481367945671082,
"rewards/rna_reward_fn/mean": 0.6865168213844299,
"rewards/rna_reward_fn/std": 0.3217703402042389,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 146.6875,
"completions/mean_terminated_length": 146.6875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.16379400342702866,
"epoch": 1.3647058823529412,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7668678760528564,
"learning_rate": 5.490196078431373e-07,
"loss": -0.0,
"num_tokens": 18102400.0,
"reward": 0.7100426554679871,
"reward_std": 0.20684288442134857,
"rewards/rna_reward_fn/mean": 0.7100426554679871,
"rewards/rna_reward_fn/std": 0.32808709144592285,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 147.40625,
"completions/mean_terminated_length": 147.40625,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"entropy": 0.16369594633579254,
"epoch": 1.3764705882352941,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4491204619407654,
"learning_rate": 5.450980392156862e-07,
"loss": -0.0,
"num_tokens": 18254368.0,
"reward": 0.6345921754837036,
"reward_std": 0.17989099025726318,
"rewards/rna_reward_fn/mean": 0.6345921754837036,
"rewards/rna_reward_fn/std": 0.3739507794380188,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 128.96875,
"completions/mean_terminated_length": 128.96875,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.16341928392648697,
"epoch": 1.388235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.40218448638916016,
"learning_rate": 5.411764705882353e-07,
"loss": 0.0,
"num_tokens": 18387456.0,
"reward": 0.6973093748092651,
"reward_std": 0.19106432795524597,
"rewards/rna_reward_fn/mean": 0.6973093748092651,
"rewards/rna_reward_fn/std": 0.328565388917923,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 185.3125,
"completions/mean_terminated_length": 185.3125,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"entropy": 0.15643662959337234,
"epoch": 1.4,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4641011953353882,
"learning_rate": 5.372549019607843e-07,
"loss": -0.0,
"num_tokens": 18578240.0,
"reward": 0.6982426643371582,
"reward_std": 0.17999790608882904,
"rewards/rna_reward_fn/mean": 0.6982426643371582,
"rewards/rna_reward_fn/std": 0.3187488615512848,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 500.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 151.125,
"completions/mean_terminated_length": 151.125,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"entropy": 0.16167542338371277,
"epoch": 1.4117647058823528,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4710671305656433,
"learning_rate": 5.333333333333333e-07,
"loss": -0.0,
"num_tokens": 18734016.0,
"reward": 0.765220046043396,
"reward_std": 0.16310608386993408,
"rewards/rna_reward_fn/mean": 0.765220046043396,
"rewards/rna_reward_fn/std": 0.30073776841163635,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 200.53125,
"completions/mean_terminated_length": 200.53125,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 0.17333289235830307,
"epoch": 1.423529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5605267882347107,
"learning_rate": 5.294117647058823e-07,
"loss": -0.0,
"num_tokens": 18940384.0,
"reward": 0.6207563877105713,
"reward_std": 0.2605891227722168,
"rewards/rna_reward_fn/mean": 0.6207563877105713,
"rewards/rna_reward_fn/std": 0.35733622312545776,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 378.0,
"completions/max_terminated_length": 378.0,
"completions/mean_length": 126.90625,
"completions/mean_terminated_length": 126.90625,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.16177111864089966,
"epoch": 1.4352941176470588,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5492433905601501,
"learning_rate": 5.254901960784313e-07,
"loss": 0.0,
"num_tokens": 19071360.0,
"reward": 0.6156597137451172,
"reward_std": 0.2084151953458786,
"rewards/rna_reward_fn/mean": 0.6156597137451172,
"rewards/rna_reward_fn/std": 0.3588009178638458,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.0,
"completions/max_terminated_length": 389.0,
"completions/mean_length": 126.15625,
"completions/mean_terminated_length": 126.15625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.1655115783214569,
"epoch": 1.4470588235294117,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5015555024147034,
"learning_rate": 5.215686274509804e-07,
"loss": 0.0,
"num_tokens": 19201568.0,
"reward": 0.6790971755981445,
"reward_std": 0.20820938050746918,
"rewards/rna_reward_fn/mean": 0.6790972352027893,
"rewards/rna_reward_fn/std": 0.33763545751571655,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 153.75,
"completions/mean_terminated_length": 153.75,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"entropy": 0.1595897227525711,
"epoch": 1.4588235294117646,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5314822793006897,
"learning_rate": 5.176470588235294e-07,
"loss": 0.0,
"num_tokens": 19360032.0,
"reward": 0.6510605812072754,
"reward_std": 0.18497204780578613,
"rewards/rna_reward_fn/mean": 0.6510605812072754,
"rewards/rna_reward_fn/std": 0.3650972247123718,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 134.5625,
"completions/mean_terminated_length": 134.5625,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.1490706205368042,
"epoch": 1.4705882352941178,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5578471422195435,
"learning_rate": 5.137254901960784e-07,
"loss": -0.0,
"num_tokens": 19498848.0,
"reward": 0.6481872797012329,
"reward_std": 0.19116738438606262,
"rewards/rna_reward_fn/mean": 0.6481872797012329,
"rewards/rna_reward_fn/std": 0.32832634449005127,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 186.0625,
"completions/mean_terminated_length": 186.0625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.16315071284770966,
"epoch": 1.4823529411764707,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47001388669013977,
"learning_rate": 5.098039215686274e-07,
"loss": 0.0,
"num_tokens": 19690400.0,
"reward": 0.6869475245475769,
"reward_std": 0.21966272592544556,
"rewards/rna_reward_fn/mean": 0.6869475245475769,
"rewards/rna_reward_fn/std": 0.3061429262161255,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 159.25,
"completions/mean_terminated_length": 159.25,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.1544899046421051,
"epoch": 1.4941176470588236,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7163305878639221,
"learning_rate": 5.058823529411765e-07,
"loss": 0.0,
"num_tokens": 19854496.0,
"reward": 0.7104751467704773,
"reward_std": 0.17693877220153809,
"rewards/rna_reward_fn/mean": 0.7104751467704773,
"rewards/rna_reward_fn/std": 0.30990538001060486,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 134.6875,
"completions/mean_terminated_length": 134.6875,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"entropy": 0.16278471052646637,
"epoch": 1.5058823529411764,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7567697167396545,
"learning_rate": 5.019607843137255e-07,
"loss": -0.0,
"num_tokens": 19993440.0,
"reward": 0.6815826296806335,
"reward_std": 0.20137576758861542,
"rewards/rna_reward_fn/mean": 0.6815826296806335,
"rewards/rna_reward_fn/std": 0.32526591420173645,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 142.5625,
"completions/mean_terminated_length": 142.5625,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 0.16126833856105804,
"epoch": 1.5176470588235293,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5958517789840698,
"learning_rate": 4.980392156862744e-07,
"loss": 0.0,
"num_tokens": 20140448.0,
"reward": 0.6496865153312683,
"reward_std": 0.23397710919380188,
"rewards/rna_reward_fn/mean": 0.6496865153312683,
"rewards/rna_reward_fn/std": 0.3660079836845398,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 418.0,
"completions/max_terminated_length": 418.0,
"completions/mean_length": 178.3125,
"completions/mean_terminated_length": 178.3125,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"entropy": 0.16705547273159027,
"epoch": 1.5294117647058822,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5045768618583679,
"learning_rate": 4.941176470588235e-07,
"loss": 0.0,
"num_tokens": 20324064.0,
"reward": 0.6084290146827698,
"reward_std": 0.22301070392131805,
"rewards/rna_reward_fn/mean": 0.608428955078125,
"rewards/rna_reward_fn/std": 0.37412387132644653,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 178.21875,
"completions/mean_terminated_length": 178.21875,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.16225259751081467,
"epoch": 1.5411764705882351,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4790975749492645,
"learning_rate": 4.901960784313725e-07,
"loss": -0.0,
"num_tokens": 20507584.0,
"reward": 0.6834284067153931,
"reward_std": 0.16327084600925446,
"rewards/rna_reward_fn/mean": 0.6834284067153931,
"rewards/rna_reward_fn/std": 0.3331601321697235,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 124.46875,
"completions/mean_terminated_length": 124.46875,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"entropy": 0.14231518656015396,
"epoch": 1.5529411764705883,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.45782116055488586,
"learning_rate": 4.862745098039216e-07,
"loss": -0.0,
"num_tokens": 20636064.0,
"reward": 0.6696175336837769,
"reward_std": 0.1951877474784851,
"rewards/rna_reward_fn/mean": 0.6696175336837769,
"rewards/rna_reward_fn/std": 0.3469404876232147,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 153.09375,
"completions/mean_terminated_length": 153.09375,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.14148423075675964,
"epoch": 1.5647058823529412,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6516547203063965,
"learning_rate": 4.823529411764705e-07,
"loss": -0.0,
"num_tokens": 20793856.0,
"reward": 0.6711336374282837,
"reward_std": 0.2223963439464569,
"rewards/rna_reward_fn/mean": 0.6711336374282837,
"rewards/rna_reward_fn/std": 0.3334668278694153,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 144.34375,
"completions/mean_terminated_length": 144.34375,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"entropy": 0.1529795005917549,
"epoch": 1.576470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5148042440414429,
"learning_rate": 4.784313725490196e-07,
"loss": 0.0,
"num_tokens": 20942688.0,
"reward": 0.759110152721405,
"reward_std": 0.16160593926906586,
"rewards/rna_reward_fn/mean": 0.7591102123260498,
"rewards/rna_reward_fn/std": 0.2931617796421051,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 339.0,
"completions/max_terminated_length": 339.0,
"completions/mean_length": 108.34375,
"completions/mean_terminated_length": 108.34375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.1443817839026451,
"epoch": 1.5882352941176472,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42829352617263794,
"learning_rate": 4.7450980392156857e-07,
"loss": -0.0,
"num_tokens": 21054656.0,
"reward": 0.6639102697372437,
"reward_std": 0.20781482756137848,
"rewards/rna_reward_fn/mean": 0.6639102697372437,
"rewards/rna_reward_fn/std": 0.3437131941318512,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 175.03125,
"completions/mean_terminated_length": 175.03125,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.15896137803792953,
"epoch": 1.6,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5342750549316406,
"learning_rate": 4.705882352941176e-07,
"loss": 0.0,
"num_tokens": 21234912.0,
"reward": 0.6274444460868835,
"reward_std": 0.22071924805641174,
"rewards/rna_reward_fn/mean": 0.6274445056915283,
"rewards/rna_reward_fn/std": 0.3473777174949646,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 398.0,
"completions/max_terminated_length": 398.0,
"completions/mean_length": 143.65625,
"completions/mean_terminated_length": 143.65625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.15408551692962646,
"epoch": 1.611764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.49438202381134033,
"learning_rate": 4.6666666666666666e-07,
"loss": 0.0,
"num_tokens": 21383040.0,
"reward": 0.6316537857055664,
"reward_std": 0.1621330976486206,
"rewards/rna_reward_fn/mean": 0.6316537857055664,
"rewards/rna_reward_fn/std": 0.34947502613067627,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 392.0,
"completions/max_terminated_length": 392.0,
"completions/mean_length": 168.84375,
"completions/mean_terminated_length": 168.84375,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 0.17249725759029388,
"epoch": 1.6235294117647059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5168977379798889,
"learning_rate": 4.627450980392157e-07,
"loss": -0.0,
"num_tokens": 21556960.0,
"reward": 0.7472211122512817,
"reward_std": 0.16369092464447021,
"rewards/rna_reward_fn/mean": 0.7472211122512817,
"rewards/rna_reward_fn/std": 0.27173811197280884,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 157.1875,
"completions/mean_terminated_length": 157.1875,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 0.16690535098314285,
"epoch": 1.6352941176470588,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5558773875236511,
"learning_rate": 4.5882352941176465e-07,
"loss": 0.0,
"num_tokens": 21718944.0,
"reward": 0.6854004859924316,
"reward_std": 0.19929495453834534,
"rewards/rna_reward_fn/mean": 0.6854004859924316,
"rewards/rna_reward_fn/std": 0.31646665930747986,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 152.59375,
"completions/mean_terminated_length": 152.59375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.1484585627913475,
"epoch": 1.6470588235294117,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9384368062019348,
"learning_rate": 4.549019607843137e-07,
"loss": -0.0,
"num_tokens": 21876224.0,
"reward": 0.6835744380950928,
"reward_std": 0.1949320137500763,
"rewards/rna_reward_fn/mean": 0.6835744380950928,
"rewards/rna_reward_fn/std": 0.35554417967796326,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 400.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 127.875,
"completions/mean_terminated_length": 127.875,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"entropy": 0.14056292921304703,
"epoch": 1.6588235294117646,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4758838713169098,
"learning_rate": 4.5098039215686274e-07,
"loss": 0.0,
"num_tokens": 22008192.0,
"reward": 0.7035012245178223,
"reward_std": 0.18292057514190674,
"rewards/rna_reward_fn/mean": 0.703501284122467,
"rewards/rna_reward_fn/std": 0.29926764965057373,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 164.59375,
"completions/mean_terminated_length": 164.59375,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 0.1475282907485962,
"epoch": 1.6705882352941175,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5269675254821777,
"learning_rate": 4.470588235294118e-07,
"loss": -0.0,
"num_tokens": 22177760.0,
"reward": 0.724274754524231,
"reward_std": 0.20411115884780884,
"rewards/rna_reward_fn/mean": 0.724274754524231,
"rewards/rna_reward_fn/std": 0.29461607336997986,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 166.09375,
"completions/mean_terminated_length": 166.09375,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"entropy": 0.14830049872398376,
"epoch": 1.6823529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5128397345542908,
"learning_rate": 4.4313725490196073e-07,
"loss": 0.0,
"num_tokens": 22348864.0,
"reward": 0.6864579916000366,
"reward_std": 0.18042539060115814,
"rewards/rna_reward_fn/mean": 0.6864579916000366,
"rewards/rna_reward_fn/std": 0.3156171441078186,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 358.0,
"completions/max_terminated_length": 358.0,
"completions/mean_length": 121.21875,
"completions/mean_terminated_length": 121.21875,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.14306584745645523,
"epoch": 1.6941176470588235,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4526241421699524,
"learning_rate": 4.392156862745098e-07,
"loss": 0.0,
"num_tokens": 22474016.0,
"reward": 0.6906402111053467,
"reward_std": 0.2201388031244278,
"rewards/rna_reward_fn/mean": 0.6906402111053467,
"rewards/rna_reward_fn/std": 0.3415301740169525,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 398.0,
"completions/max_terminated_length": 398.0,
"completions/mean_length": 111.0625,
"completions/mean_terminated_length": 111.0625,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.14087412506341934,
"epoch": 1.7058823529411766,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4583019018173218,
"learning_rate": 4.352941176470588e-07,
"loss": 0.0,
"num_tokens": 22588768.0,
"reward": 0.7702864408493042,
"reward_std": 0.1817162036895752,
"rewards/rna_reward_fn/mean": 0.7702864408493042,
"rewards/rna_reward_fn/std": 0.28576594591140747,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 152.46875,
"completions/mean_terminated_length": 152.46875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.13646821677684784,
"epoch": 1.7176470588235295,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5821676850318909,
"learning_rate": 4.313725490196078e-07,
"loss": -0.0,
"num_tokens": 22745920.0,
"reward": 0.6735475659370422,
"reward_std": 0.2079792022705078,
"rewards/rna_reward_fn/mean": 0.6735475659370422,
"rewards/rna_reward_fn/std": 0.34127116203308105,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 398.0,
"completions/max_terminated_length": 398.0,
"completions/mean_length": 137.0625,
"completions/mean_terminated_length": 137.0625,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"entropy": 0.1294446587562561,
"epoch": 1.7294117647058824,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47053244709968567,
"learning_rate": 4.274509803921568e-07,
"loss": 0.0,
"num_tokens": 22887296.0,
"reward": 0.7310217618942261,
"reward_std": 0.16372641921043396,
"rewards/rna_reward_fn/mean": 0.7310217618942261,
"rewards/rna_reward_fn/std": 0.29399389028549194,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 164.03125,
"completions/mean_terminated_length": 164.03125,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"entropy": 0.16281016170978546,
"epoch": 1.7411764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5588626861572266,
"learning_rate": 4.2352941176470586e-07,
"loss": 0.0,
"num_tokens": 23056288.0,
"reward": 0.654833197593689,
"reward_std": 0.1884084939956665,
"rewards/rna_reward_fn/mean": 0.654833197593689,
"rewards/rna_reward_fn/std": 0.3517378270626068,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 140.84375,
"completions/mean_terminated_length": 140.84375,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"entropy": 0.15908341854810715,
"epoch": 1.7529411764705882,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5507121086120605,
"learning_rate": 4.196078431372549e-07,
"loss": 0.0,
"num_tokens": 23201536.0,
"reward": 0.699113667011261,
"reward_std": 0.20187973976135254,
"rewards/rna_reward_fn/mean": 0.699113667011261,
"rewards/rna_reward_fn/std": 0.3249177634716034,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 400.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 192.4375,
"completions/mean_terminated_length": 192.4375,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"entropy": 0.15749355405569077,
"epoch": 1.7647058823529411,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47758468985557556,
"learning_rate": 4.156862745098039e-07,
"loss": 0.0,
"num_tokens": 23399616.0,
"reward": 0.6602087020874023,
"reward_std": 0.2426632046699524,
"rewards/rna_reward_fn/mean": 0.6602087020874023,
"rewards/rna_reward_fn/std": 0.3394790291786194,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 186.375,
"completions/mean_terminated_length": 186.375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.1590714380145073,
"epoch": 1.776470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5084402561187744,
"learning_rate": 4.117647058823529e-07,
"loss": 0.0,
"num_tokens": 23591488.0,
"reward": 0.6650402545928955,
"reward_std": 0.18303653597831726,
"rewards/rna_reward_fn/mean": 0.6650401949882507,
"rewards/rna_reward_fn/std": 0.33965203166007996,
"step": 151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 141.40625,
"completions/mean_terminated_length": 141.40625,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"entropy": 0.14213567227125168,
"epoch": 1.788235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5413779020309448,
"learning_rate": 4.0784313725490194e-07,
"loss": -0.0,
"num_tokens": 23737312.0,
"reward": 0.6437839865684509,
"reward_std": 0.2132418155670166,
"rewards/rna_reward_fn/mean": 0.6437839865684509,
"rewards/rna_reward_fn/std": 0.3476622402667999,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 140.75,
"completions/mean_terminated_length": 140.75,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 0.14729295670986176,
"epoch": 1.8,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.48154816031455994,
"learning_rate": 4.03921568627451e-07,
"loss": -0.0,
"num_tokens": 23882464.0,
"reward": 0.6620033979415894,
"reward_std": 0.22405345737934113,
"rewards/rna_reward_fn/mean": 0.6620033979415894,
"rewards/rna_reward_fn/std": 0.3390491306781769,
"step": 153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 166.46875,
"completions/mean_terminated_length": 166.46875,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 0.14903101325035095,
"epoch": 1.811764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.625751793384552,
"learning_rate": 4e-07,
"loss": -0.0,
"num_tokens": 24053952.0,
"reward": 0.6442551612854004,
"reward_std": 0.17395520210266113,
"rewards/rna_reward_fn/mean": 0.6442551612854004,
"rewards/rna_reward_fn/std": 0.3670194745063782,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 157.90625,
"completions/mean_terminated_length": 157.90625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.15323904901742935,
"epoch": 1.8235294117647058,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.48200494050979614,
"learning_rate": 3.96078431372549e-07,
"loss": -0.0,
"num_tokens": 24216672.0,
"reward": 0.6359031200408936,
"reward_std": 0.17717690765857697,
"rewards/rna_reward_fn/mean": 0.6359031200408936,
"rewards/rna_reward_fn/std": 0.32817214727401733,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 472.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 145.8125,
"completions/mean_terminated_length": 145.8125,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 0.1613752394914627,
"epoch": 1.835294117647059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.46832966804504395,
"learning_rate": 3.92156862745098e-07,
"loss": 0.0,
"num_tokens": 24367008.0,
"reward": 0.7130154371261597,
"reward_std": 0.18193909525871277,
"rewards/rna_reward_fn/mean": 0.7130154371261597,
"rewards/rna_reward_fn/std": 0.3411928117275238,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 142.46875,
"completions/mean_terminated_length": 142.46875,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"entropy": 0.13961906731128693,
"epoch": 1.8470588235294119,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6261844038963318,
"learning_rate": 3.8823529411764707e-07,
"loss": -0.0,
"num_tokens": 24513920.0,
"reward": 0.711245596408844,
"reward_std": 0.1767653077840805,
"rewards/rna_reward_fn/mean": 0.7112456560134888,
"rewards/rna_reward_fn/std": 0.3348366618156433,
"step": 157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 412.0,
"completions/max_terminated_length": 412.0,
"completions/mean_length": 152.4375,
"completions/mean_terminated_length": 152.4375,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.1567898690700531,
"epoch": 1.8588235294117648,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5200847387313843,
"learning_rate": 3.8431372549019606e-07,
"loss": -0.0,
"num_tokens": 24671040.0,
"reward": 0.7147434949874878,
"reward_std": 0.14905846118927002,
"rewards/rna_reward_fn/mean": 0.7147434949874878,
"rewards/rna_reward_fn/std": 0.3070945739746094,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 392.0,
"completions/max_terminated_length": 392.0,
"completions/mean_length": 125.71875,
"completions/mean_terminated_length": 125.71875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.133110411465168,
"epoch": 1.8705882352941177,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4239906370639801,
"learning_rate": 3.8039215686274506e-07,
"loss": 0.0,
"num_tokens": 24800800.0,
"reward": 0.640139639377594,
"reward_std": 0.20033451914787292,
"rewards/rna_reward_fn/mean": 0.640139639377594,
"rewards/rna_reward_fn/std": 0.3294910490512848,
"step": 159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 376.0,
"completions/max_terminated_length": 376.0,
"completions/mean_length": 134.8125,
"completions/mean_terminated_length": 134.8125,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 0.12187084183096886,
"epoch": 1.8823529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.38697147369384766,
"learning_rate": 3.764705882352941e-07,
"loss": -0.0,
"num_tokens": 24939872.0,
"reward": 0.6659330725669861,
"reward_std": 0.16438628733158112,
"rewards/rna_reward_fn/mean": 0.6659330725669861,
"rewards/rna_reward_fn/std": 0.35713815689086914,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 135.5625,
"completions/mean_terminated_length": 135.5625,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"entropy": 0.13703680038452148,
"epoch": 1.8941176470588235,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4564237594604492,
"learning_rate": 3.7254901960784315e-07,
"loss": 0.0,
"num_tokens": 25079712.0,
"reward": 0.6596216559410095,
"reward_std": 0.20437049865722656,
"rewards/rna_reward_fn/mean": 0.6596216559410095,
"rewards/rna_reward_fn/std": 0.3517865240573883,
"step": 161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 177.0625,
"completions/mean_terminated_length": 177.0625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.15036547183990479,
"epoch": 1.9058823529411764,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.45348137617111206,
"learning_rate": 3.6862745098039214e-07,
"loss": -0.0,
"num_tokens": 25262048.0,
"reward": 0.6836435198783875,
"reward_std": 0.20624709129333496,
"rewards/rna_reward_fn/mean": 0.6836435198783875,
"rewards/rna_reward_fn/std": 0.32797813415527344,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 141.71875,
"completions/mean_terminated_length": 141.71875,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 0.14257021248340607,
"epoch": 1.9176470588235293,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4581199586391449,
"learning_rate": 3.6470588235294114e-07,
"loss": -0.0,
"num_tokens": 25408192.0,
"reward": 0.6231480836868286,
"reward_std": 0.20732316374778748,
"rewards/rna_reward_fn/mean": 0.6231480836868286,
"rewards/rna_reward_fn/std": 0.35448968410491943,
"step": 163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.0,
"completions/max_terminated_length": 389.0,
"completions/mean_length": 103.90625,
"completions/mean_terminated_length": 103.90625,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 0.11931119486689568,
"epoch": 1.9294117647058824,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42869991064071655,
"learning_rate": 3.607843137254902e-07,
"loss": -0.0,
"num_tokens": 25515616.0,
"reward": 0.7718137502670288,
"reward_std": 0.15544265508651733,
"rewards/rna_reward_fn/mean": 0.7718137502670288,
"rewards/rna_reward_fn/std": 0.2820202112197876,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.0,
"completions/max_terminated_length": 399.0,
"completions/mean_length": 118.34375,
"completions/mean_terminated_length": 118.34375,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.13630840182304382,
"epoch": 1.9411764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4796566069126129,
"learning_rate": 3.5686274509803923e-07,
"loss": 0.0,
"num_tokens": 25637824.0,
"reward": 0.7639800310134888,
"reward_std": 0.16217514872550964,
"rewards/rna_reward_fn/mean": 0.7639800310134888,
"rewards/rna_reward_fn/std": 0.2800072729587555,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 492.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 196.1875,
"completions/mean_terminated_length": 196.1875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.1692701205611229,
"epoch": 1.9529411764705882,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.576678991317749,
"learning_rate": 3.529411764705882e-07,
"loss": 0.0,
"num_tokens": 25839744.0,
"reward": 0.62703537940979,
"reward_std": 0.24643635749816895,
"rewards/rna_reward_fn/mean": 0.62703537940979,
"rewards/rna_reward_fn/std": 0.3669246435165405,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 167.96875,
"completions/mean_terminated_length": 167.96875,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 0.16024480760097504,
"epoch": 1.9647058823529413,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7311699390411377,
"learning_rate": 3.490196078431372e-07,
"loss": 0.0,
"num_tokens": 26012768.0,
"reward": 0.6588948369026184,
"reward_std": 0.1576000452041626,
"rewards/rna_reward_fn/mean": 0.6588948965072632,
"rewards/rna_reward_fn/std": 0.32907265424728394,
"step": 167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 201.5,
"completions/mean_terminated_length": 201.5,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"entropy": 0.1511036530137062,
"epoch": 1.9764705882352942,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4694945216178894,
"learning_rate": 3.4509803921568627e-07,
"loss": 0.0,
"num_tokens": 26220128.0,
"reward": 0.6976197957992554,
"reward_std": 0.19369524717330933,
"rewards/rna_reward_fn/mean": 0.6976197957992554,
"rewards/rna_reward_fn/std": 0.32611048221588135,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 400.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 154.5,
"completions/mean_terminated_length": 154.5,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.15085221827030182,
"epoch": 1.988235294117647,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.7034254670143127,
"learning_rate": 3.411764705882353e-07,
"loss": 0.0,
"num_tokens": 26379360.0,
"reward": 0.6942508220672607,
"reward_std": 0.20178331434726715,
"rewards/rna_reward_fn/mean": 0.6942508220672607,
"rewards/rna_reward_fn/std": 0.31030499935150146,
"step": 169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 490.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 160.53125,
"completions/mean_terminated_length": 160.53125,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 0.15548591315746307,
"epoch": 2.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5434289574623108,
"learning_rate": 3.372549019607843e-07,
"loss": -0.0,
"num_tokens": 26544768.0,
"reward": 0.6601583957672119,
"reward_std": 0.15550854802131653,
"rewards/rna_reward_fn/mean": 0.6601583361625671,
"rewards/rna_reward_fn/std": 0.3311554193496704,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 160.40625,
"completions/mean_terminated_length": 160.40625,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"entropy": 0.1544594094157219,
"epoch": 2.011764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6815203428268433,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0,
"num_tokens": 26710048.0,
"reward": 0.5972940921783447,
"reward_std": 0.18555977940559387,
"rewards/rna_reward_fn/mean": 0.5972940921783447,
"rewards/rna_reward_fn/std": 0.36445632576942444,
"step": 171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 157.40625,
"completions/mean_terminated_length": 157.40625,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 0.14051128178834915,
"epoch": 2.023529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5093562602996826,
"learning_rate": 3.2941176470588235e-07,
"loss": 0.0,
"num_tokens": 26872256.0,
"reward": 0.6649138927459717,
"reward_std": 0.2001783400774002,
"rewards/rna_reward_fn/mean": 0.6649138331413269,
"rewards/rna_reward_fn/std": 0.3582386374473572,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 173.75,
"completions/mean_terminated_length": 173.75,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.14279819279909134,
"epoch": 2.0352941176470587,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4454724192619324,
"learning_rate": 3.2549019607843134e-07,
"loss": -0.0,
"num_tokens": 27051200.0,
"reward": 0.7748029828071594,
"reward_std": 0.14138856530189514,
"rewards/rna_reward_fn/mean": 0.7748030424118042,
"rewards/rna_reward_fn/std": 0.2777082026004791,
"step": 173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 165.8125,
"completions/mean_terminated_length": 165.8125,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.13190212100744247,
"epoch": 2.0470588235294116,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4604037404060364,
"learning_rate": 3.215686274509804e-07,
"loss": 0.0,
"num_tokens": 27222016.0,
"reward": 0.6792135238647461,
"reward_std": 0.17050443589687347,
"rewards/rna_reward_fn/mean": 0.6792135834693909,
"rewards/rna_reward_fn/std": 0.3469991087913513,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 140.21875,
"completions/mean_terminated_length": 140.21875,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 0.11882514134049416,
"epoch": 2.0588235294117645,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42415928840637207,
"learning_rate": 3.176470588235294e-07,
"loss": -0.0,
"num_tokens": 27366624.0,
"reward": 0.618835985660553,
"reward_std": 0.19730809330940247,
"rewards/rna_reward_fn/mean": 0.6188360452651978,
"rewards/rna_reward_fn/std": 0.3514353334903717,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 154.25,
"completions/mean_terminated_length": 154.25,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 0.12727607041597366,
"epoch": 2.070588235294118,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5830354690551758,
"learning_rate": 3.1372549019607843e-07,
"loss": 0.0,
"num_tokens": 27525600.0,
"reward": 0.6785444617271423,
"reward_std": 0.18948182463645935,
"rewards/rna_reward_fn/mean": 0.6785444617271423,
"rewards/rna_reward_fn/std": 0.3351566791534424,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 147.78125,
"completions/mean_terminated_length": 147.78125,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 0.14719800651073456,
"epoch": 2.0823529411764707,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.4794676899909973,
"learning_rate": 3.098039215686274e-07,
"loss": 0.0,
"num_tokens": 27677952.0,
"reward": 0.7077100276947021,
"reward_std": 0.1931176781654358,
"rewards/rna_reward_fn/mean": 0.7077100276947021,
"rewards/rna_reward_fn/std": 0.3137640357017517,
"step": 177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 142.46875,
"completions/mean_terminated_length": 142.46875,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"entropy": 0.15307611972093582,
"epoch": 2.0941176470588236,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6268736720085144,
"learning_rate": 3.0588235294117647e-07,
"loss": 0.0,
"num_tokens": 27824864.0,
"reward": 0.7079458236694336,
"reward_std": 0.2219894826412201,
"rewards/rna_reward_fn/mean": 0.7079458236694336,
"rewards/rna_reward_fn/std": 0.3472329080104828,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 164.0,
"completions/mean_terminated_length": 164.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.13749201595783234,
"epoch": 2.1058823529411765,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5293802618980408,
"learning_rate": 3.0196078431372546e-07,
"loss": 0.0,
"num_tokens": 27993824.0,
"reward": 0.6385776996612549,
"reward_std": 0.2456386685371399,
"rewards/rna_reward_fn/mean": 0.6385776996612549,
"rewards/rna_reward_fn/std": 0.36081886291503906,
"step": 179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 140.21875,
"completions/mean_terminated_length": 140.21875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.1387496143579483,
"epoch": 2.1176470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.538530707359314,
"learning_rate": 2.980392156862745e-07,
"loss": -0.0,
"num_tokens": 28138432.0,
"reward": 0.6739398241043091,
"reward_std": 0.21720820665359497,
"rewards/rna_reward_fn/mean": 0.6739398837089539,
"rewards/rna_reward_fn/std": 0.30697187781333923,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 118.65625,
"completions/mean_terminated_length": 118.65625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.11488081514835358,
"epoch": 2.1294117647058823,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42285630106925964,
"learning_rate": 2.941176470588235e-07,
"loss": -0.0,
"num_tokens": 28260960.0,
"reward": 0.7317262887954712,
"reward_std": 0.20456328988075256,
"rewards/rna_reward_fn/mean": 0.7317262887954712,
"rewards/rna_reward_fn/std": 0.2935360074043274,
"step": 181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 384.0,
"completions/max_terminated_length": 384.0,
"completions/mean_length": 128.8125,
"completions/mean_terminated_length": 128.8125,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 0.13038966059684753,
"epoch": 2.1411764705882352,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.43837785720825195,
"learning_rate": 2.9019607843137255e-07,
"loss": 0.0,
"num_tokens": 28393888.0,
"reward": 0.7334122657775879,
"reward_std": 0.1874283403158188,
"rewards/rna_reward_fn/mean": 0.7334122657775879,
"rewards/rna_reward_fn/std": 0.3205217123031616,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 142.1875,
"completions/mean_terminated_length": 142.1875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.142289437353611,
"epoch": 2.152941176470588,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4689069092273712,
"learning_rate": 2.8627450980392154e-07,
"loss": -0.0,
"num_tokens": 28540512.0,
"reward": 0.738664448261261,
"reward_std": 0.16794101893901825,
"rewards/rna_reward_fn/mean": 0.7386645078659058,
"rewards/rna_reward_fn/std": 0.30475351214408875,
"step": 183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 150.1875,
"completions/mean_terminated_length": 150.1875,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"entropy": 0.13591318577528,
"epoch": 2.164705882352941,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.48003292083740234,
"learning_rate": 2.823529411764706e-07,
"loss": -0.0,
"num_tokens": 28695328.0,
"reward": 0.6993162631988525,
"reward_std": 0.1979941427707672,
"rewards/rna_reward_fn/mean": 0.6993162035942078,
"rewards/rna_reward_fn/std": 0.31292685866355896,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 472.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 173.65625,
"completions/mean_terminated_length": 173.65625,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 0.15518562495708466,
"epoch": 2.176470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6343421339988708,
"learning_rate": 2.784313725490196e-07,
"loss": -0.0,
"num_tokens": 28874176.0,
"reward": 0.7311723232269287,
"reward_std": 0.2127300500869751,
"rewards/rna_reward_fn/mean": 0.7311723232269287,
"rewards/rna_reward_fn/std": 0.3124001622200012,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 137.5625,
"completions/mean_terminated_length": 137.5625,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.1409146785736084,
"epoch": 2.1882352941176473,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.46661409735679626,
"learning_rate": 2.7450980392156863e-07,
"loss": -0.0,
"num_tokens": 29016064.0,
"reward": 0.7118009328842163,
"reward_std": 0.16496126353740692,
"rewards/rna_reward_fn/mean": 0.7118009328842163,
"rewards/rna_reward_fn/std": 0.32205572724342346,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 151.21875,
"completions/mean_terminated_length": 151.21875,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 0.14989649504423141,
"epoch": 2.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.44188031554222107,
"learning_rate": 2.705882352941176e-07,
"loss": -0.0,
"num_tokens": 29171936.0,
"reward": 0.7327808141708374,
"reward_std": 0.17523989081382751,
"rewards/rna_reward_fn/mean": 0.7327808141708374,
"rewards/rna_reward_fn/std": 0.32806655764579773,
"step": 187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.0,
"completions/max_terminated_length": 389.0,
"completions/mean_length": 157.84375,
"completions/mean_terminated_length": 157.84375,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"entropy": 0.14322884380817413,
"epoch": 2.211764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5148700475692749,
"learning_rate": 2.6666666666666667e-07,
"loss": -0.0,
"num_tokens": 29334592.0,
"reward": 0.6917252540588379,
"reward_std": 0.17680642008781433,
"rewards/rna_reward_fn/mean": 0.6917252540588379,
"rewards/rna_reward_fn/std": 0.30800244212150574,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 164.0,
"completions/mean_terminated_length": 164.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"entropy": 0.14842171967029572,
"epoch": 2.223529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5274482369422913,
"learning_rate": 2.6274509803921567e-07,
"loss": 0.0,
"num_tokens": 29503552.0,
"reward": 0.7333264350891113,
"reward_std": 0.17190617322921753,
"rewards/rna_reward_fn/mean": 0.7333264350891113,
"rewards/rna_reward_fn/std": 0.26974406838417053,
"step": 189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 167.875,
"completions/mean_terminated_length": 167.875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.12728291004896164,
"epoch": 2.235294117647059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4334995746612549,
"learning_rate": 2.588235294117647e-07,
"loss": -0.0,
"num_tokens": 29676480.0,
"reward": 0.6551768779754639,
"reward_std": 0.18493275344371796,
"rewards/rna_reward_fn/mean": 0.6551768779754639,
"rewards/rna_reward_fn/std": 0.33756914734840393,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 142.59375,
"completions/mean_terminated_length": 142.59375,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.13632921129465103,
"epoch": 2.2470588235294118,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5202718377113342,
"learning_rate": 2.549019607843137e-07,
"loss": -0.0,
"num_tokens": 29823520.0,
"reward": 0.779222309589386,
"reward_std": 0.1619720160961151,
"rewards/rna_reward_fn/mean": 0.779222309589386,
"rewards/rna_reward_fn/std": 0.255502849817276,
"step": 191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 381.0,
"completions/max_terminated_length": 381.0,
"completions/mean_length": 141.8125,
"completions/mean_terminated_length": 141.8125,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.1468304842710495,
"epoch": 2.2588235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4959217309951782,
"learning_rate": 2.5098039215686275e-07,
"loss": 0.0,
"num_tokens": 29969760.0,
"reward": 0.6328116655349731,
"reward_std": 0.20429277420043945,
"rewards/rna_reward_fn/mean": 0.6328116655349731,
"rewards/rna_reward_fn/std": 0.3653068244457245,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 472.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 147.03125,
"completions/mean_terminated_length": 147.03125,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"entropy": 0.14507943391799927,
"epoch": 2.2705882352941176,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.46249526739120483,
"learning_rate": 2.4705882352941175e-07,
"loss": -0.0,
"num_tokens": 30121344.0,
"reward": 0.6946768760681152,
"reward_std": 0.16386722028255463,
"rewards/rna_reward_fn/mean": 0.6946768760681152,
"rewards/rna_reward_fn/std": 0.3166311979293823,
"step": 193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 347.0,
"completions/max_terminated_length": 347.0,
"completions/mean_length": 119.1875,
"completions/mean_terminated_length": 119.1875,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.1289873719215393,
"epoch": 2.2823529411764705,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.43832215666770935,
"learning_rate": 2.431372549019608e-07,
"loss": -0.0,
"num_tokens": 30244416.0,
"reward": 0.7309268116950989,
"reward_std": 0.16351744532585144,
"rewards/rna_reward_fn/mean": 0.7309267520904541,
"rewards/rna_reward_fn/std": 0.27468279004096985,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 132.40625,
"completions/mean_terminated_length": 132.40625,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"entropy": 0.14909712970256805,
"epoch": 2.2941176470588234,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4866437613964081,
"learning_rate": 2.392156862745098e-07,
"loss": -0.0,
"num_tokens": 30381024.0,
"reward": 0.6669021844863892,
"reward_std": 0.19414769113063812,
"rewards/rna_reward_fn/mean": 0.6669021844863892,
"rewards/rna_reward_fn/std": 0.3391817808151245,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 174.0,
"completions/mean_terminated_length": 174.0,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 0.14798294007778168,
"epoch": 2.3058823529411763,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.590640127658844,
"learning_rate": 2.352941176470588e-07,
"loss": -0.0,
"num_tokens": 30560224.0,
"reward": 0.6385676860809326,
"reward_std": 0.20142759382724762,
"rewards/rna_reward_fn/mean": 0.6385676860809326,
"rewards/rna_reward_fn/std": 0.34272608160972595,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 125.125,
"completions/mean_terminated_length": 125.125,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"entropy": 0.1469191089272499,
"epoch": 2.317647058823529,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4721366763114929,
"learning_rate": 2.3137254901960785e-07,
"loss": -0.0,
"num_tokens": 30689376.0,
"reward": 0.7269188165664673,
"reward_std": 0.19917072355747223,
"rewards/rna_reward_fn/mean": 0.7269188165664673,
"rewards/rna_reward_fn/std": 0.3235536217689514,
"step": 197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 170.21875,
"completions/mean_terminated_length": 170.21875,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.1481616050004959,
"epoch": 2.3294117647058825,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4824952483177185,
"learning_rate": 2.2745098039215685e-07,
"loss": 0.0,
"num_tokens": 30864704.0,
"reward": 0.7315170764923096,
"reward_std": 0.19473856687545776,
"rewards/rna_reward_fn/mean": 0.7315171360969543,
"rewards/rna_reward_fn/std": 0.31163889169692993,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 124.21875,
"completions/mean_terminated_length": 124.21875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.11309440433979034,
"epoch": 2.3411764705882354,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.43292057514190674,
"learning_rate": 2.235294117647059e-07,
"loss": -0.0,
"num_tokens": 30992928.0,
"reward": 0.6969711184501648,
"reward_std": 0.18462812900543213,
"rewards/rna_reward_fn/mean": 0.6969711780548096,
"rewards/rna_reward_fn/std": 0.30229660868644714,
"step": 199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 295.0,
"completions/max_terminated_length": 295.0,
"completions/mean_length": 115.625,
"completions/mean_terminated_length": 115.625,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.1170443557202816,
"epoch": 2.3529411764705883,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42780736088752747,
"learning_rate": 2.196078431372549e-07,
"loss": 0.0,
"num_tokens": 31112352.0,
"reward": 0.7397186160087585,
"reward_std": 0.16325643658638,
"rewards/rna_reward_fn/mean": 0.7397185564041138,
"rewards/rna_reward_fn/std": 0.2868645191192627,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 191.78125,
"completions/mean_terminated_length": 191.78125,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"entropy": 0.158894345164299,
"epoch": 2.364705882352941,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5923020243644714,
"learning_rate": 2.156862745098039e-07,
"loss": 0.0,
"num_tokens": 31309760.0,
"reward": 0.713019609451294,
"reward_std": 0.1600976586341858,
"rewards/rna_reward_fn/mean": 0.7130196690559387,
"rewards/rna_reward_fn/std": 0.3151859641075134,
"step": 201
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 500.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 167.15625,
"completions/mean_terminated_length": 167.15625,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.15573827922344208,
"epoch": 2.376470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5989984273910522,
"learning_rate": 2.1176470588235293e-07,
"loss": -0.0,
"num_tokens": 31481952.0,
"reward": 0.7245238423347473,
"reward_std": 0.21510586142539978,
"rewards/rna_reward_fn/mean": 0.7245238423347473,
"rewards/rna_reward_fn/std": 0.3133554756641388,
"step": 202
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 336.0,
"completions/max_terminated_length": 336.0,
"completions/mean_length": 147.15625,
"completions/mean_terminated_length": 147.15625,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.14043358713388443,
"epoch": 2.388235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.45242446660995483,
"learning_rate": 2.0784313725490195e-07,
"loss": 0.0,
"num_tokens": 31633664.0,
"reward": 0.6685344576835632,
"reward_std": 0.19693541526794434,
"rewards/rna_reward_fn/mean": 0.6685344576835632,
"rewards/rna_reward_fn/std": 0.33878231048583984,
"step": 203
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 160.78125,
"completions/mean_terminated_length": 160.78125,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.14151378720998764,
"epoch": 2.4,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.578268826007843,
"learning_rate": 2.0392156862745097e-07,
"loss": 0.0,
"num_tokens": 31799328.0,
"reward": 0.753953218460083,
"reward_std": 0.14072492718696594,
"rewards/rna_reward_fn/mean": 0.753953218460083,
"rewards/rna_reward_fn/std": 0.323638916015625,
"step": 204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 116.71875,
"completions/mean_terminated_length": 116.71875,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 0.14078038185834885,
"epoch": 2.411764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5669292211532593,
"learning_rate": 2e-07,
"loss": 0.0,
"num_tokens": 31919872.0,
"reward": 0.7278470993041992,
"reward_std": 0.18851059675216675,
"rewards/rna_reward_fn/mean": 0.7278470993041992,
"rewards/rna_reward_fn/std": 0.31520187854766846,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 165.1875,
"completions/mean_terminated_length": 165.1875,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.1560438796877861,
"epoch": 2.4235294117647057,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5335204005241394,
"learning_rate": 1.96078431372549e-07,
"loss": -0.0,
"num_tokens": 32090048.0,
"reward": 0.74782395362854,
"reward_std": 0.16413238644599915,
"rewards/rna_reward_fn/mean": 0.74782395362854,
"rewards/rna_reward_fn/std": 0.27966901659965515,
"step": 206
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 129.75,
"completions/mean_terminated_length": 129.75,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 0.13756585866212845,
"epoch": 2.435294117647059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4791547358036041,
"learning_rate": 1.9215686274509803e-07,
"loss": -0.0,
"num_tokens": 32223936.0,
"reward": 0.7443541884422302,
"reward_std": 0.20347487926483154,
"rewards/rna_reward_fn/mean": 0.744354248046875,
"rewards/rna_reward_fn/std": 0.2934330999851227,
"step": 207
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 144.46875,
"completions/mean_terminated_length": 144.46875,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 0.14090368151664734,
"epoch": 2.447058823529412,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.48767152428627014,
"learning_rate": 1.8823529411764705e-07,
"loss": -0.0,
"num_tokens": 32372896.0,
"reward": 0.7094341516494751,
"reward_std": 0.1646713763475418,
"rewards/rna_reward_fn/mean": 0.7094341516494751,
"rewards/rna_reward_fn/std": 0.31243574619293213,
"step": 208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 121.375,
"completions/mean_terminated_length": 121.375,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"entropy": 0.13812856376171112,
"epoch": 2.458823529411765,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.43114832043647766,
"learning_rate": 1.8431372549019607e-07,
"loss": -0.0,
"num_tokens": 32498208.0,
"reward": 0.7636112570762634,
"reward_std": 0.1354459822177887,
"rewards/rna_reward_fn/mean": 0.7636112570762634,
"rewards/rna_reward_fn/std": 0.2837965786457062,
"step": 209
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 157.9375,
"completions/mean_terminated_length": 157.9375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.12325883284211159,
"epoch": 2.4705882352941178,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7042959928512573,
"learning_rate": 1.803921568627451e-07,
"loss": -0.0,
"num_tokens": 32660960.0,
"reward": 0.685276985168457,
"reward_std": 0.14444154500961304,
"rewards/rna_reward_fn/mean": 0.685276985168457,
"rewards/rna_reward_fn/std": 0.3264351785182953,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 398.0,
"completions/max_terminated_length": 398.0,
"completions/mean_length": 149.28125,
"completions/mean_terminated_length": 149.28125,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.14060577005147934,
"epoch": 2.4823529411764707,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7576245665550232,
"learning_rate": 1.764705882352941e-07,
"loss": 0.0,
"num_tokens": 32814848.0,
"reward": 0.7403950691223145,
"reward_std": 0.19349028170108795,
"rewards/rna_reward_fn/mean": 0.7403950691223145,
"rewards/rna_reward_fn/std": 0.31960996985435486,
"step": 211
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 140.09375,
"completions/mean_terminated_length": 140.09375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.128474622964859,
"epoch": 2.4941176470588236,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4394446611404419,
"learning_rate": 1.7254901960784313e-07,
"loss": -0.0,
"num_tokens": 32959328.0,
"reward": 0.7468061447143555,
"reward_std": 0.13857056200504303,
"rewards/rna_reward_fn/mean": 0.7468062043190002,
"rewards/rna_reward_fn/std": 0.2608503997325897,
"step": 212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 144.03125,
"completions/mean_terminated_length": 144.03125,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.14114519208669662,
"epoch": 2.5058823529411764,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.5121099352836609,
"learning_rate": 1.6862745098039215e-07,
"loss": 0.0,
"num_tokens": 33107840.0,
"reward": 0.6896160244941711,
"reward_std": 0.17474885284900665,
"rewards/rna_reward_fn/mean": 0.6896160244941711,
"rewards/rna_reward_fn/std": 0.30136245489120483,
"step": 213
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 196.625,
"completions/mean_terminated_length": 196.625,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.1554037183523178,
"epoch": 2.5176470588235293,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5231500864028931,
"learning_rate": 1.6470588235294117e-07,
"loss": 0.0,
"num_tokens": 33310208.0,
"reward": 0.7346584796905518,
"reward_std": 0.20079070329666138,
"rewards/rna_reward_fn/mean": 0.7346584796905518,
"rewards/rna_reward_fn/std": 0.30361971259117126,
"step": 214
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 138.9375,
"completions/mean_terminated_length": 138.9375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.12060126662254333,
"epoch": 2.5294117647058822,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.45047426223754883,
"learning_rate": 1.607843137254902e-07,
"loss": 0.0,
"num_tokens": 33453504.0,
"reward": 0.768707275390625,
"reward_std": 0.13694067299365997,
"rewards/rna_reward_fn/mean": 0.7687073349952698,
"rewards/rna_reward_fn/std": 0.27220436930656433,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 198.78125,
"completions/mean_terminated_length": 198.78125,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 0.1575038880109787,
"epoch": 2.541176470588235,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5329861044883728,
"learning_rate": 1.5686274509803921e-07,
"loss": -0.0,
"num_tokens": 33658080.0,
"reward": 0.7541199922561646,
"reward_std": 0.15449070930480957,
"rewards/rna_reward_fn/mean": 0.7541199922561646,
"rewards/rna_reward_fn/std": 0.2656092345714569,
"step": 216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 412.0,
"completions/max_terminated_length": 412.0,
"completions/mean_length": 158.34375,
"completions/mean_terminated_length": 158.34375,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.15501223504543304,
"epoch": 2.552941176470588,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.45992547273635864,
"learning_rate": 1.5294117647058823e-07,
"loss": 0.0,
"num_tokens": 33821248.0,
"reward": 0.7572486400604248,
"reward_std": 0.15161246061325073,
"rewards/rna_reward_fn/mean": 0.7572486400604248,
"rewards/rna_reward_fn/std": 0.29167696833610535,
"step": 217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 169.625,
"completions/mean_terminated_length": 169.625,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 0.13358986377716064,
"epoch": 2.564705882352941,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8965858817100525,
"learning_rate": 1.4901960784313725e-07,
"loss": -0.0,
"num_tokens": 33995968.0,
"reward": 0.7292990684509277,
"reward_std": 0.16865938901901245,
"rewards/rna_reward_fn/mean": 0.7292990684509277,
"rewards/rna_reward_fn/std": 0.30115416646003723,
"step": 218
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 176.78125,
"completions/mean_terminated_length": 176.78125,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.13434413820505142,
"epoch": 2.576470588235294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.582165002822876,
"learning_rate": 1.4509803921568628e-07,
"loss": -0.0,
"num_tokens": 34178016.0,
"reward": 0.6599196195602417,
"reward_std": 0.196761354804039,
"rewards/rna_reward_fn/mean": 0.6599196791648865,
"rewards/rna_reward_fn/std": 0.33999550342559814,
"step": 219
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 156.1875,
"completions/mean_terminated_length": 156.1875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.1357617899775505,
"epoch": 2.588235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5189464688301086,
"learning_rate": 1.411764705882353e-07,
"loss": 0.0,
"num_tokens": 34338976.0,
"reward": 0.7549696564674377,
"reward_std": 0.1326015144586563,
"rewards/rna_reward_fn/mean": 0.7549696564674377,
"rewards/rna_reward_fn/std": 0.2852962613105774,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 148.15625,
"completions/mean_terminated_length": 148.15625,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"entropy": 0.15427181124687195,
"epoch": 2.6,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.536194920539856,
"learning_rate": 1.3725490196078432e-07,
"loss": 0.0,
"num_tokens": 34491712.0,
"reward": 0.7131255865097046,
"reward_std": 0.14100758731365204,
"rewards/rna_reward_fn/mean": 0.7131255865097046,
"rewards/rna_reward_fn/std": 0.31784212589263916,
"step": 221
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 380.0,
"completions/max_terminated_length": 380.0,
"completions/mean_length": 145.1875,
"completions/mean_terminated_length": 145.1875,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 0.13709458708763123,
"epoch": 2.611764705882353,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.5712235569953918,
"learning_rate": 1.3333333333333334e-07,
"loss": 0.0,
"num_tokens": 34641408.0,
"reward": 0.7191460132598877,
"reward_std": 0.16943207383155823,
"rewards/rna_reward_fn/mean": 0.7191460132598877,
"rewards/rna_reward_fn/std": 0.3015574514865875,
"step": 222
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 145.1875,
"completions/mean_terminated_length": 145.1875,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"entropy": 0.13566020876169205,
"epoch": 2.623529411764706,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.4192090630531311,
"learning_rate": 1.2941176470588236e-07,
"loss": 0.0,
"num_tokens": 34791104.0,
"reward": 0.7555572986602783,
"reward_std": 0.16786056756973267,
"rewards/rna_reward_fn/mean": 0.7555572986602783,
"rewards/rna_reward_fn/std": 0.2797638177871704,
"step": 223
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 165.09375,
"completions/mean_terminated_length": 165.09375,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 0.12663453072309494,
"epoch": 2.635294117647059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6057937145233154,
"learning_rate": 1.2549019607843138e-07,
"loss": -0.0,
"num_tokens": 34961184.0,
"reward": 0.6839346289634705,
"reward_std": 0.19452279806137085,
"rewards/rna_reward_fn/mean": 0.6839346289634705,
"rewards/rna_reward_fn/std": 0.33146002888679504,
"step": 224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 167.65625,
"completions/mean_terminated_length": 167.65625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.1426771581172943,
"epoch": 2.6470588235294117,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4763612747192383,
"learning_rate": 1.215686274509804e-07,
"loss": 0.0,
"num_tokens": 35133888.0,
"reward": 0.6619032621383667,
"reward_std": 0.17893120646476746,
"rewards/rna_reward_fn/mean": 0.6619032621383667,
"rewards/rna_reward_fn/std": 0.3283209800720215,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.0,
"completions/max_terminated_length": 399.0,
"completions/mean_length": 149.9375,
"completions/mean_terminated_length": 149.9375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.14778528362512589,
"epoch": 2.6588235294117646,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4169410169124603,
"learning_rate": 1.176470588235294e-07,
"loss": -0.0,
"num_tokens": 35288448.0,
"reward": 0.6732456088066101,
"reward_std": 0.16452832520008087,
"rewards/rna_reward_fn/mean": 0.6732455492019653,
"rewards/rna_reward_fn/std": 0.3249601721763611,
"step": 226
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 145.09375,
"completions/mean_terminated_length": 145.09375,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 0.1449032723903656,
"epoch": 2.6705882352941175,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6590065360069275,
"learning_rate": 1.1372549019607842e-07,
"loss": -0.0,
"num_tokens": 35438048.0,
"reward": 0.7874460220336914,
"reward_std": 0.12049897015094757,
"rewards/rna_reward_fn/mean": 0.7874460220336914,
"rewards/rna_reward_fn/std": 0.2661431133747101,
"step": 227
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 151.75,
"completions/mean_terminated_length": 151.75,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.13789667189121246,
"epoch": 2.682352941176471,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.501124918460846,
"learning_rate": 1.0980392156862744e-07,
"loss": -0.0,
"num_tokens": 35594464.0,
"reward": 0.76551353931427,
"reward_std": 0.14058314263820648,
"rewards/rna_reward_fn/mean": 0.7655135989189148,
"rewards/rna_reward_fn/std": 0.2855876088142395,
"step": 228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 163.71875,
"completions/mean_terminated_length": 163.71875,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"entropy": 0.14094559848308563,
"epoch": 2.6941176470588237,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.736441433429718,
"learning_rate": 1.0588235294117647e-07,
"loss": 0.0,
"num_tokens": 35763136.0,
"reward": 0.6939565539360046,
"reward_std": 0.16584208607673645,
"rewards/rna_reward_fn/mean": 0.6939565539360046,
"rewards/rna_reward_fn/std": 0.32086971402168274,
"step": 229
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 139.78125,
"completions/mean_terminated_length": 139.78125,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"entropy": 0.13419293239712715,
"epoch": 2.7058823529411766,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6264002919197083,
"learning_rate": 1.0196078431372549e-07,
"loss": -0.0,
"num_tokens": 35907296.0,
"reward": 0.7488532066345215,
"reward_std": 0.1620199978351593,
"rewards/rna_reward_fn/mean": 0.7488532066345215,
"rewards/rna_reward_fn/std": 0.2980068624019623,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 397.0,
"completions/max_terminated_length": 397.0,
"completions/mean_length": 137.40625,
"completions/mean_terminated_length": 137.40625,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.13055864721536636,
"epoch": 2.7176470588235295,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4814888536930084,
"learning_rate": 9.80392156862745e-08,
"loss": 0.0,
"num_tokens": 36049024.0,
"reward": 0.6655980348587036,
"reward_std": 0.15648490190505981,
"rewards/rna_reward_fn/mean": 0.6655980348587036,
"rewards/rna_reward_fn/std": 0.35470837354660034,
"step": 231
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 130.90625,
"completions/mean_terminated_length": 130.90625,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 0.12380100041627884,
"epoch": 2.7294117647058824,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.583757221698761,
"learning_rate": 9.411764705882353e-08,
"loss": -0.0,
"num_tokens": 36184096.0,
"reward": 0.7524540424346924,
"reward_std": 0.15423446893692017,
"rewards/rna_reward_fn/mean": 0.7524540424346924,
"rewards/rna_reward_fn/std": 0.28454405069351196,
"step": 232
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.0,
"completions/max_terminated_length": 399.0,
"completions/mean_length": 145.34375,
"completions/mean_terminated_length": 145.34375,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.1325184628367424,
"epoch": 2.7411764705882353,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4390006959438324,
"learning_rate": 9.019607843137255e-08,
"loss": -0.0,
"num_tokens": 36333952.0,
"reward": 0.7277975082397461,
"reward_std": 0.19573622941970825,
"rewards/rna_reward_fn/mean": 0.7277975082397461,
"rewards/rna_reward_fn/std": 0.32145431637763977,
"step": 233
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 168.125,
"completions/mean_terminated_length": 168.125,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 0.13657083362340927,
"epoch": 2.7529411764705882,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7681740522384644,
"learning_rate": 8.627450980392157e-08,
"loss": -0.0,
"num_tokens": 36507136.0,
"reward": 0.7168524265289307,
"reward_std": 0.18613344430923462,
"rewards/rna_reward_fn/mean": 0.7168524265289307,
"rewards/rna_reward_fn/std": 0.3243979215621948,
"step": 234
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 163.875,
"completions/mean_terminated_length": 163.875,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 0.14333349466323853,
"epoch": 2.764705882352941,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5657479763031006,
"learning_rate": 8.235294117647059e-08,
"loss": 0.0,
"num_tokens": 36675968.0,
"reward": 0.725771427154541,
"reward_std": 0.16519448161125183,
"rewards/rna_reward_fn/mean": 0.725771427154541,
"rewards/rna_reward_fn/std": 0.29766252636909485,
"step": 235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 418.0,
"completions/max_terminated_length": 418.0,
"completions/mean_length": 156.46875,
"completions/mean_terminated_length": 156.46875,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 0.1441263109445572,
"epoch": 2.776470588235294,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.4572143256664276,
"learning_rate": 7.843137254901961e-08,
"loss": 0.0,
"num_tokens": 36837216.0,
"reward": 0.742597222328186,
"reward_std": 0.16114118695259094,
"rewards/rna_reward_fn/mean": 0.742597222328186,
"rewards/rna_reward_fn/std": 0.29970842599868774,
"step": 236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 396.0,
"completions/max_terminated_length": 396.0,
"completions/mean_length": 158.1875,
"completions/mean_terminated_length": 158.1875,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.1409977823495865,
"epoch": 2.788235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42590776085853577,
"learning_rate": 7.450980392156863e-08,
"loss": -0.0,
"num_tokens": 37000224.0,
"reward": 0.7145720720291138,
"reward_std": 0.164639413356781,
"rewards/rna_reward_fn/mean": 0.7145720720291138,
"rewards/rna_reward_fn/std": 0.3098330497741699,
"step": 237
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 107.21875,
"completions/mean_terminated_length": 107.21875,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.11754556372761726,
"epoch": 2.8,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4764781892299652,
"learning_rate": 7.058823529411765e-08,
"loss": 0.0,
"num_tokens": 37111040.0,
"reward": 0.7425558567047119,
"reward_std": 0.16547845304012299,
"rewards/rna_reward_fn/mean": 0.7425558567047119,
"rewards/rna_reward_fn/std": 0.3051395118236542,
"step": 238
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 492.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 172.84375,
"completions/mean_terminated_length": 172.84375,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 0.14019257575273514,
"epoch": 2.8117647058823527,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5157439708709717,
"learning_rate": 6.666666666666667e-08,
"loss": -0.0,
"num_tokens": 37289056.0,
"reward": 0.6816315650939941,
"reward_std": 0.2366928905248642,
"rewards/rna_reward_fn/mean": 0.6816315650939941,
"rewards/rna_reward_fn/std": 0.326466828584671,
"step": 239
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 164.15625,
"completions/mean_terminated_length": 164.15625,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.1466379389166832,
"epoch": 2.8235294117647056,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5139991044998169,
"learning_rate": 6.274509803921569e-08,
"loss": 0.0,
"num_tokens": 37458176.0,
"reward": 0.7532614469528198,
"reward_std": 0.1603999137878418,
"rewards/rna_reward_fn/mean": 0.7532614469528198,
"rewards/rna_reward_fn/std": 0.31244710087776184,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 157.15625,
"completions/mean_terminated_length": 157.15625,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.12356984615325928,
"epoch": 2.835294117647059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9720450043678284,
"learning_rate": 5.88235294117647e-08,
"loss": -0.0,
"num_tokens": 37620128.0,
"reward": 0.7346148490905762,
"reward_std": 0.15429024398326874,
"rewards/rna_reward_fn/mean": 0.7346148490905762,
"rewards/rna_reward_fn/std": 0.31154975295066833,
"step": 241
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 153.5,
"completions/mean_terminated_length": 153.5,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"entropy": 0.1341606229543686,
"epoch": 2.847058823529412,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5591171979904175,
"learning_rate": 5.490196078431372e-08,
"loss": -0.0,
"num_tokens": 37778336.0,
"reward": 0.7116289138793945,
"reward_std": 0.21866443753242493,
"rewards/rna_reward_fn/mean": 0.7116289138793945,
"rewards/rna_reward_fn/std": 0.2980954051017761,
"step": 242
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 482.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 203.4375,
"completions/mean_terminated_length": 203.4375,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"entropy": 0.14845673739910126,
"epoch": 2.8588235294117648,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5372319221496582,
"learning_rate": 5.098039215686274e-08,
"loss": 0.0,
"num_tokens": 37987680.0,
"reward": 0.7392944693565369,
"reward_std": 0.19700977206230164,
"rewards/rna_reward_fn/mean": 0.7392945289611816,
"rewards/rna_reward_fn/std": 0.30940258502960205,
"step": 243
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 139.6875,
"completions/mean_terminated_length": 139.6875,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.13047143816947937,
"epoch": 2.8705882352941177,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5987316370010376,
"learning_rate": 4.705882352941176e-08,
"loss": -0.0,
"num_tokens": 38131744.0,
"reward": 0.6977779269218445,
"reward_std": 0.2151854932308197,
"rewards/rna_reward_fn/mean": 0.6977779269218445,
"rewards/rna_reward_fn/std": 0.3459690511226654,
"step": 244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 490.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 148.40625,
"completions/mean_terminated_length": 148.40625,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 0.14810562878847122,
"epoch": 2.8823529411764706,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7430775165557861,
"learning_rate": 4.313725490196078e-08,
"loss": -0.0,
"num_tokens": 38284736.0,
"reward": 0.6900802254676819,
"reward_std": 0.18723735213279724,
"rewards/rna_reward_fn/mean": 0.6900802254676819,
"rewards/rna_reward_fn/std": 0.3328934609889984,
"step": 245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 139.5625,
"completions/mean_terminated_length": 139.5625,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 0.12182106822729111,
"epoch": 2.8941176470588235,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.49635204672813416,
"learning_rate": 3.9215686274509804e-08,
"loss": 0.0,
"num_tokens": 38428672.0,
"reward": 0.7072439193725586,
"reward_std": 0.1840672791004181,
"rewards/rna_reward_fn/mean": 0.7072439193725586,
"rewards/rna_reward_fn/std": 0.3065541088581085,
"step": 246
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 167.6875,
"completions/mean_terminated_length": 167.6875,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.13815301656723022,
"epoch": 2.9058823529411764,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8550586104393005,
"learning_rate": 3.5294117647058824e-08,
"loss": -0.0,
"num_tokens": 38601408.0,
"reward": 0.7532185316085815,
"reward_std": 0.1475568264722824,
"rewards/rna_reward_fn/mean": 0.7532185316085815,
"rewards/rna_reward_fn/std": 0.29489991068840027,
"step": 247
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 340.0,
"completions/max_terminated_length": 340.0,
"completions/mean_length": 122.34375,
"completions/mean_terminated_length": 122.34375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 0.12259503453969955,
"epoch": 2.9176470588235293,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.44689512252807617,
"learning_rate": 3.1372549019607844e-08,
"loss": 0.0,
"num_tokens": 38727712.0,
"reward": 0.7440149784088135,
"reward_std": 0.1674138307571411,
"rewards/rna_reward_fn/mean": 0.7440149188041687,
"rewards/rna_reward_fn/std": 0.3040436804294586,
"step": 248
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 474.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 192.0,
"completions/mean_terminated_length": 192.0,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 0.1282111555337906,
"epoch": 2.9294117647058826,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5679563879966736,
"learning_rate": 2.745098039215686e-08,
"loss": 0.0,
"num_tokens": 38925344.0,
"reward": 0.6850175857543945,
"reward_std": 0.19530020654201508,
"rewards/rna_reward_fn/mean": 0.6850175857543945,
"rewards/rna_reward_fn/std": 0.33921393752098083,
"step": 249
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 117.09375,
"completions/mean_terminated_length": 117.09375,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"entropy": 0.12855321913957596,
"epoch": 2.9411764705882355,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.505153238773346,
"learning_rate": 2.352941176470588e-08,
"loss": -0.0,
"num_tokens": 39046272.0,
"reward": 0.6269246339797974,
"reward_std": 0.16829745471477509,
"rewards/rna_reward_fn/mean": 0.6269246339797974,
"rewards/rna_reward_fn/std": 0.33109787106513977,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 424.0,
"completions/max_terminated_length": 424.0,
"completions/mean_length": 121.0,
"completions/mean_terminated_length": 121.0,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"entropy": 0.12059168517589569,
"epoch": 2.9529411764705884,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4366406500339508,
"learning_rate": 1.9607843137254902e-08,
"loss": 0.0,
"num_tokens": 39171200.0,
"reward": 0.7053718566894531,
"reward_std": 0.14770260453224182,
"rewards/rna_reward_fn/mean": 0.7053717970848083,
"rewards/rna_reward_fn/std": 0.3234374523162842,
"step": 251
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 327.0,
"completions/max_terminated_length": 327.0,
"completions/mean_length": 132.1875,
"completions/mean_terminated_length": 132.1875,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 0.13018939644098282,
"epoch": 2.9647058823529413,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6731492280960083,
"learning_rate": 1.5686274509803922e-08,
"loss": 0.0,
"num_tokens": 39307584.0,
"reward": 0.7679715752601624,
"reward_std": 0.17536047101020813,
"rewards/rna_reward_fn/mean": 0.7679715156555176,
"rewards/rna_reward_fn/std": 0.2801183760166168,
"step": 252
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 145.40625,
"completions/mean_terminated_length": 145.40625,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.10920717194676399,
"epoch": 2.976470588235294,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.46245628595352173,
"learning_rate": 1.176470588235294e-08,
"loss": 0.0,
"num_tokens": 39457504.0,
"reward": 0.7559751272201538,
"reward_std": 0.15144692361354828,
"rewards/rna_reward_fn/mean": 0.7559751272201538,
"rewards/rna_reward_fn/std": 0.3152746260166168,
"step": 253
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 193.3125,
"completions/mean_terminated_length": 193.3125,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.15460387617349625,
"epoch": 2.988235294117647,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6124170422554016,
"learning_rate": 7.843137254901961e-09,
"loss": 0.0,
"num_tokens": 39656480.0,
"reward": 0.7068374752998352,
"reward_std": 0.19490104913711548,
"rewards/rna_reward_fn/mean": 0.7068374752998352,
"rewards/rna_reward_fn/std": 0.310377836227417,
"step": 254
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 149.5,
"completions/mean_terminated_length": 149.5,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.1327020823955536,
"epoch": 3.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5195903778076172,
"learning_rate": 3.9215686274509805e-09,
"loss": -0.0,
"num_tokens": 39810592.0,
"reward": 0.7493961453437805,
"reward_std": 0.17497789859771729,
"rewards/rna_reward_fn/mean": 0.7493961453437805,
"rewards/rna_reward_fn/std": 0.31194695830345154,
"step": 255
}
],
"logging_steps": 1.0,
"max_steps": 255,
"num_input_tokens_seen": 39810592,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}