{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 255, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 116.875, "completions/mean_terminated_length": 116.875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.3960496634244919, "epoch": 0.011764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.3658151626586914, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 120704.0, "reward": 0.42291906476020813, "reward_std": 0.353160560131073, "rewards/rna_reward_fn/mean": 0.42291906476020813, "rewards/rna_reward_fn/std": 0.39480823278427124, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 145.34375, "completions/mean_terminated_length": 145.34375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3918581157922745, "epoch": 0.023529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.3561055362224579, "learning_rate": 9.96078431372549e-07, "loss": 0.0, "num_tokens": 270560.0, "reward": 0.4679465889930725, "reward_std": 0.304127037525177, "rewards/rna_reward_fn/mean": 0.4679465889930725, "rewards/rna_reward_fn/std": 0.37357842922210693, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 169.4375, "completions/mean_terminated_length": 169.4375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.3528731167316437, "epoch": 0.03529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.3573973476886749, "learning_rate": 9.92156862745098e-07, "loss": 0.0, "num_tokens": 445088.0, "reward": 0.4688035249710083, "reward_std": 0.3215726613998413, "rewards/rna_reward_fn/mean": 0.4688035249710083, "rewards/rna_reward_fn/std": 0.3945569097995758, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 164.53125, "completions/mean_terminated_length": 164.53125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.3565346747636795, "epoch": 0.047058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 0.37075310945510864, "learning_rate": 9.88235294117647e-07, "loss": -0.0, "num_tokens": 614592.0, "reward": 0.5333437323570251, "reward_std": 0.3202625513076782, "rewards/rna_reward_fn/mean": 0.5333437323570251, "rewards/rna_reward_fn/std": 0.3746815025806427, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 103.3125, "completions/mean_terminated_length": 103.3125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.35146908462047577, "epoch": 0.058823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 0.34449008107185364, "learning_rate": 9.84313725490196e-07, "loss": -0.0, "num_tokens": 721408.0, "reward": 0.5266900062561035, "reward_std": 0.32159364223480225, "rewards/rna_reward_fn/mean": 0.5266900062561035, "rewards/rna_reward_fn/std": 0.3701845705509186, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.3309106081724167, "epoch": 0.07058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.35763484239578247, "learning_rate": 9.80392156862745e-07, "loss": -0.0, "num_tokens": 887552.0, "reward": 0.5357265472412109, "reward_std": 0.2797412872314453, "rewards/rna_reward_fn/mean": 0.5357265472412109, "rewards/rna_reward_fn/std": 0.3577335476875305, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.34717176854610443, "epoch": 0.08235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.3663802146911621, "learning_rate": 9.76470588235294e-07, "loss": -0.0, "num_tokens": 1043584.0, "reward": 0.547458291053772, "reward_std": 0.2995288372039795, "rewards/rna_reward_fn/mean": 0.547458291053772, "rewards/rna_reward_fn/std": 0.3604092001914978, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 167.125, "completions/mean_terminated_length": 167.125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.31340789794921875, "epoch": 0.09411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.4071066081523895, "learning_rate": 9.725490196078432e-07, "loss": -0.0, "num_tokens": 1215744.0, "reward": 0.5176310539245605, "reward_std": 0.3205966353416443, "rewards/rna_reward_fn/mean": 0.5176310539245605, "rewards/rna_reward_fn/std": 0.3642078638076782, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 151.59375, "completions/mean_terminated_length": 151.59375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.305365189909935, "epoch": 0.10588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.3989139795303345, "learning_rate": 9.686274509803921e-07, "loss": -0.0, "num_tokens": 1372000.0, "reward": 0.6008568406105042, "reward_std": 0.30818045139312744, "rewards/rna_reward_fn/mean": 0.6008569002151489, "rewards/rna_reward_fn/std": 0.35290631651878357, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 135.53125, "completions/mean_terminated_length": 135.53125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.2962174266576767, "epoch": 0.11764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.44642144441604614, "learning_rate": 9.64705882352941e-07, "loss": 0.0, "num_tokens": 1511808.0, "reward": 0.540717601776123, "reward_std": 0.3060719966888428, "rewards/rna_reward_fn/mean": 0.540717601776123, "rewards/rna_reward_fn/std": 0.36574023962020874, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 187.71875, "completions/mean_terminated_length": 187.71875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.2934599667787552, "epoch": 0.12941176470588237, "frac_reward_zero_std": 0.0, "grad_norm": 0.3814420700073242, "learning_rate": 9.607843137254902e-07, "loss": -0.0, "num_tokens": 1705056.0, "reward": 0.6084277629852295, "reward_std": 0.3016743063926697, "rewards/rna_reward_fn/mean": 0.6084277629852295, "rewards/rna_reward_fn/std": 0.37008586525917053, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 123.65625, "completions/mean_terminated_length": 123.65625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.28613443672657013, "epoch": 0.1411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.34958702325820923, "learning_rate": 9.568627450980392e-07, "loss": 0.0, "num_tokens": 1832704.0, "reward": 0.6017879247665405, "reward_std": 0.3006741404533386, "rewards/rna_reward_fn/mean": 0.6017879247665405, "rewards/rna_reward_fn/std": 0.35490649938583374, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 140.65625, "completions/mean_terminated_length": 140.65625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.277506560087204, "epoch": 0.15294117647058825, "frac_reward_zero_std": 0.0, "grad_norm": 0.5352854132652283, "learning_rate": 9.529411764705881e-07, "loss": 0.0, "num_tokens": 1977760.0, "reward": 0.571915328502655, "reward_std": 0.2985040843486786, "rewards/rna_reward_fn/mean": 0.5719153881072998, "rewards/rna_reward_fn/std": 0.3767135441303253, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 154.03125, "completions/mean_terminated_length": 154.03125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.2907712608575821, "epoch": 0.16470588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 0.40935981273651123, "learning_rate": 9.490196078431371e-07, "loss": 0.0, "num_tokens": 2136512.0, "reward": 0.5937778353691101, "reward_std": 0.270163893699646, "rewards/rna_reward_fn/mean": 0.5937778353691101, "rewards/rna_reward_fn/std": 0.3509018123149872, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 184.40625, "completions/mean_terminated_length": 184.40625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.27846619486808777, "epoch": 0.17647058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 0.41769424080848694, "learning_rate": 9.450980392156862e-07, "loss": 0.0, "num_tokens": 2326368.0, "reward": 0.6163018941879272, "reward_std": 0.26538053154945374, "rewards/rna_reward_fn/mean": 0.6163018941879272, "rewards/rna_reward_fn/std": 0.3496814966201782, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 117.84375, "completions/mean_terminated_length": 117.84375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.2604786157608032, "epoch": 0.18823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.3845226764678955, "learning_rate": 9.411764705882352e-07, "loss": 0.0, "num_tokens": 2448064.0, "reward": 0.5925071239471436, "reward_std": 0.2943580150604248, "rewards/rna_reward_fn/mean": 0.5925071239471436, "rewards/rna_reward_fn/std": 0.3674796521663666, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 112.125, "completions/mean_terminated_length": 112.125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.25712524354457855, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.39543959498405457, "learning_rate": 9.372549019607843e-07, "loss": -0.0, "num_tokens": 2563904.0, "reward": 0.5904660224914551, "reward_std": 0.26803961396217346, "rewards/rna_reward_fn/mean": 0.5904660224914551, "rewards/rna_reward_fn/std": 0.3583122193813324, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 138.40625, "completions/mean_terminated_length": 138.40625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.27494488656520844, "epoch": 0.21176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.32191383838653564, "learning_rate": 9.333333333333333e-07, "loss": -0.0, "num_tokens": 2706656.0, "reward": 0.6467701196670532, "reward_std": 0.2634694576263428, "rewards/rna_reward_fn/mean": 0.6467701196670532, "rewards/rna_reward_fn/std": 0.3313148319721222, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 137.6875, "completions/mean_terminated_length": 137.6875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.260918065905571, "epoch": 0.2235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.4905475378036499, "learning_rate": 9.294117647058824e-07, "loss": 0.0, "num_tokens": 2848672.0, "reward": 0.5871793031692505, "reward_std": 0.25154006481170654, "rewards/rna_reward_fn/mean": 0.5871793031692505, "rewards/rna_reward_fn/std": 0.3587729334831238, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 166.78125, "completions/mean_terminated_length": 166.78125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.26801037788391113, "epoch": 0.23529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.7330372929573059, "learning_rate": 9.254901960784314e-07, "loss": -0.0, "num_tokens": 3020480.0, "reward": 0.5460379123687744, "reward_std": 0.27695512771606445, "rewards/rna_reward_fn/mean": 0.5460379123687744, "rewards/rna_reward_fn/std": 0.37495046854019165, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 142.6875, "completions/mean_terminated_length": 142.6875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.26508544385433197, "epoch": 0.24705882352941178, "frac_reward_zero_std": 0.0, "grad_norm": 0.4575193524360657, "learning_rate": 9.215686274509803e-07, "loss": 0.0, "num_tokens": 3167616.0, "reward": 0.6192805171012878, "reward_std": 0.2736813426017761, "rewards/rna_reward_fn/mean": 0.6192805171012878, "rewards/rna_reward_fn/std": 0.3539046049118042, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 154.25, "completions/mean_terminated_length": 154.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.25467583537101746, "epoch": 0.25882352941176473, "frac_reward_zero_std": 0.0, "grad_norm": 0.407436341047287, "learning_rate": 9.176470588235293e-07, "loss": 0.0, "num_tokens": 3326592.0, "reward": 0.5778753757476807, "reward_std": 0.27449485659599304, "rewards/rna_reward_fn/mean": 0.5778753757476807, "rewards/rna_reward_fn/std": 0.3692671060562134, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 135.46875, "completions/mean_terminated_length": 135.46875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.23743800073862076, "epoch": 0.27058823529411763, "frac_reward_zero_std": 0.0, "grad_norm": 0.36481839418411255, "learning_rate": 9.137254901960783e-07, "loss": -0.0, "num_tokens": 3466336.0, "reward": 0.6230462193489075, "reward_std": 0.27385085821151733, "rewards/rna_reward_fn/mean": 0.6230462193489075, "rewards/rna_reward_fn/std": 0.35384857654571533, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.2592047303915024, "epoch": 0.2823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.40386804938316345, "learning_rate": 9.098039215686274e-07, "loss": -0.0, "num_tokens": 3630432.0, "reward": 0.587247908115387, "reward_std": 0.26836222410202026, "rewards/rna_reward_fn/mean": 0.587247908115387, "rewards/rna_reward_fn/std": 0.3811717927455902, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.23664871603250504, "epoch": 0.29411764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 0.514864444732666, "learning_rate": 9.058823529411764e-07, "loss": -0.0, "num_tokens": 3787488.0, "reward": 0.6044737696647644, "reward_std": 0.2556478679180145, "rewards/rna_reward_fn/mean": 0.6044737696647644, "rewards/rna_reward_fn/std": 0.3558889329433441, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 140.5, "completions/mean_terminated_length": 140.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.2437874600291252, "epoch": 0.3058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.4290100038051605, "learning_rate": 9.019607843137255e-07, "loss": -0.0, "num_tokens": 3932384.0, "reward": 0.583857536315918, "reward_std": 0.2450568526983261, "rewards/rna_reward_fn/mean": 0.583857536315918, "rewards/rna_reward_fn/std": 0.3653680384159088, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 164.8125, "completions/mean_terminated_length": 164.8125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.24944818764925003, "epoch": 0.3176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.42284926772117615, "learning_rate": 8.980392156862745e-07, "loss": -0.0, "num_tokens": 4102176.0, "reward": 0.5925735235214233, "reward_std": 0.2968187630176544, "rewards/rna_reward_fn/mean": 0.5925735235214233, "rewards/rna_reward_fn/std": 0.3608212471008301, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 146.1875, "completions/mean_terminated_length": 146.1875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.22080854326486588, "epoch": 0.32941176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 0.4605961740016937, "learning_rate": 8.941176470588236e-07, "loss": 0.0, "num_tokens": 4252896.0, "reward": 0.5584173202514648, "reward_std": 0.2890748083591461, "rewards/rna_reward_fn/mean": 0.5584173202514648, "rewards/rna_reward_fn/std": 0.3958645462989807, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 175.90625, "completions/mean_terminated_length": 175.90625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.2321019321680069, "epoch": 0.3411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.5582552552223206, "learning_rate": 8.901960784313724e-07, "loss": 0.0, "num_tokens": 4434048.0, "reward": 0.5966294407844543, "reward_std": 0.2823025584220886, "rewards/rna_reward_fn/mean": 0.5966294407844543, "rewards/rna_reward_fn/std": 0.3560717701911926, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 174.1875, "completions/mean_terminated_length": 174.1875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.21510899811983109, "epoch": 0.35294117647058826, "frac_reward_zero_std": 0.0, "grad_norm": 0.49061208963394165, "learning_rate": 8.862745098039215e-07, "loss": -0.0, "num_tokens": 4613440.0, "reward": 0.5848400592803955, "reward_std": 0.267974317073822, "rewards/rna_reward_fn/mean": 0.5848400592803955, "rewards/rna_reward_fn/std": 0.37775954604148865, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 163.15625, "completions/mean_terminated_length": 163.15625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.2507341653108597, "epoch": 0.36470588235294116, "frac_reward_zero_std": 0.0, "grad_norm": 0.603717029094696, "learning_rate": 8.823529411764705e-07, "loss": 0.0, "num_tokens": 4781536.0, "reward": 0.6572607159614563, "reward_std": 0.2553848624229431, "rewards/rna_reward_fn/mean": 0.6572607159614563, "rewards/rna_reward_fn/std": 0.3443078398704529, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 170.34375, "completions/mean_terminated_length": 170.34375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.2254045456647873, "epoch": 0.3764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.5129714608192444, "learning_rate": 8.784313725490196e-07, "loss": -0.0, "num_tokens": 4956992.0, "reward": 0.6237974762916565, "reward_std": 0.2781754732131958, "rewards/rna_reward_fn/mean": 0.6237974762916565, "rewards/rna_reward_fn/std": 0.37038782238960266, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 140.96875, "completions/mean_terminated_length": 140.96875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.23444515466690063, "epoch": 0.38823529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 0.5718296766281128, "learning_rate": 8.745098039215686e-07, "loss": -0.0, "num_tokens": 5102368.0, "reward": 0.663845956325531, "reward_std": 0.23731249570846558, "rewards/rna_reward_fn/mean": 0.6638458967208862, "rewards/rna_reward_fn/std": 0.3386061191558838, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 135.84375, "completions/mean_terminated_length": 135.84375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.21551413834095, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.48484402894973755, "learning_rate": 8.705882352941177e-07, "loss": 0.0, "num_tokens": 5242496.0, "reward": 0.5733575224876404, "reward_std": 0.2985653281211853, "rewards/rna_reward_fn/mean": 0.5733575224876404, "rewards/rna_reward_fn/std": 0.3665997385978699, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 128.34375, "completions/mean_terminated_length": 128.34375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.19232773780822754, "epoch": 0.4117647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 0.3821423351764679, "learning_rate": 8.666666666666667e-07, "loss": 0.0, "num_tokens": 5374944.0, "reward": 0.6459628939628601, "reward_std": 0.27456825971603394, "rewards/rna_reward_fn/mean": 0.6459628939628601, "rewards/rna_reward_fn/std": 0.3492187559604645, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 117.28125, "completions/mean_terminated_length": 117.28125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.2170068845152855, "epoch": 0.4235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.519378125667572, "learning_rate": 8.627450980392156e-07, "loss": -0.0, "num_tokens": 5496064.0, "reward": 0.6556386947631836, "reward_std": 0.2442726194858551, "rewards/rna_reward_fn/mean": 0.6556386947631836, "rewards/rna_reward_fn/std": 0.3574485182762146, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.23470622301101685, "epoch": 0.43529411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 0.4268864691257477, "learning_rate": 8.588235294117646e-07, "loss": 0.0, "num_tokens": 5644288.0, "reward": 0.6998727917671204, "reward_std": 0.2536011040210724, "rewards/rna_reward_fn/mean": 0.6998728513717651, "rewards/rna_reward_fn/std": 0.34483227133750916, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 161.84375, "completions/mean_terminated_length": 161.84375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.20661279559135437, "epoch": 0.4470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 0.49551478028297424, "learning_rate": 8.549019607843136e-07, "loss": 0.0, "num_tokens": 5811040.0, "reward": 0.60715651512146, "reward_std": 0.2498263716697693, "rewards/rna_reward_fn/mean": 0.60715651512146, "rewards/rna_reward_fn/std": 0.3692743182182312, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 169.09375, "completions/mean_terminated_length": 169.09375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.22686513513326645, "epoch": 0.4588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.539655864238739, "learning_rate": 8.509803921568627e-07, "loss": 0.0, "num_tokens": 5985216.0, "reward": 0.606254518032074, "reward_std": 0.27362608909606934, "rewards/rna_reward_fn/mean": 0.606254518032074, "rewards/rna_reward_fn/std": 0.37834590673446655, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.20522872358560562, "epoch": 0.47058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 0.4427753686904907, "learning_rate": 8.470588235294117e-07, "loss": 0.0, "num_tokens": 6148672.0, "reward": 0.6244011521339417, "reward_std": 0.2686484158039093, "rewards/rna_reward_fn/mean": 0.6244011521339417, "rewards/rna_reward_fn/std": 0.3721536099910736, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 165.5, "completions/mean_terminated_length": 165.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.22500251233577728, "epoch": 0.4823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.8924270272254944, "learning_rate": 8.431372549019608e-07, "loss": 0.0, "num_tokens": 6319168.0, "reward": 0.5321128368377686, "reward_std": 0.29077643156051636, "rewards/rna_reward_fn/mean": 0.5321128368377686, "rewards/rna_reward_fn/std": 0.3840348422527313, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 160.90625, "completions/mean_terminated_length": 160.90625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.23232445865869522, "epoch": 0.49411764705882355, "frac_reward_zero_std": 0.0, "grad_norm": 0.4376697540283203, "learning_rate": 8.392156862745098e-07, "loss": 0.0, "num_tokens": 6484960.0, "reward": 0.6353960037231445, "reward_std": 0.2474566251039505, "rewards/rna_reward_fn/mean": 0.6353960037231445, "rewards/rna_reward_fn/std": 0.3577839136123657, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 156.9375, "completions/mean_terminated_length": 156.9375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.21899814903736115, "epoch": 0.5058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 0.5491610765457153, "learning_rate": 8.352941176470589e-07, "loss": -0.0, "num_tokens": 6646688.0, "reward": 0.6090617775917053, "reward_std": 0.2399156093597412, "rewards/rna_reward_fn/mean": 0.6090618371963501, "rewards/rna_reward_fn/std": 0.35401132702827454, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 161.28125, "completions/mean_terminated_length": 161.28125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.2018352746963501, "epoch": 0.5176470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 0.4728248715400696, "learning_rate": 8.313725490196078e-07, "loss": -0.0, "num_tokens": 6812864.0, "reward": 0.5414500832557678, "reward_std": 0.257457435131073, "rewards/rna_reward_fn/mean": 0.5414501428604126, "rewards/rna_reward_fn/std": 0.37554678320884705, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 144.53125, "completions/mean_terminated_length": 144.53125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.21590139716863632, "epoch": 0.5294117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 0.45613518357276917, "learning_rate": 8.274509803921567e-07, "loss": 0.0, "num_tokens": 6961888.0, "reward": 0.5840362310409546, "reward_std": 0.24920199811458588, "rewards/rna_reward_fn/mean": 0.5840362310409546, "rewards/rna_reward_fn/std": 0.3838988244533539, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 151.59375, "completions/mean_terminated_length": 151.59375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.20446214824914932, "epoch": 0.5411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.4725431799888611, "learning_rate": 8.235294117647058e-07, "loss": 0.0, "num_tokens": 7118144.0, "reward": 0.5587388277053833, "reward_std": 0.25771480798721313, "rewards/rna_reward_fn/mean": 0.5587388277053833, "rewards/rna_reward_fn/std": 0.3881581127643585, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 148.09375, "completions/mean_terminated_length": 148.09375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.20715581625699997, "epoch": 0.5529411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 0.5775709748268127, "learning_rate": 8.196078431372548e-07, "loss": -0.0, "num_tokens": 7270816.0, "reward": 0.6535854935646057, "reward_std": 0.23074793815612793, "rewards/rna_reward_fn/mean": 0.6535854339599609, "rewards/rna_reward_fn/std": 0.35560858249664307, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.20631568133831024, "epoch": 0.5647058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 0.5872889161109924, "learning_rate": 8.156862745098039e-07, "loss": -0.0, "num_tokens": 7423648.0, "reward": 0.5795817375183105, "reward_std": 0.26122066378593445, "rewards/rna_reward_fn/mean": 0.5795817375183105, "rewards/rna_reward_fn/std": 0.3758288025856018, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 124.71875, "completions/mean_terminated_length": 124.71875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.19562938064336777, "epoch": 0.5764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.450082391500473, "learning_rate": 8.117647058823529e-07, "loss": 0.0, "num_tokens": 7552384.0, "reward": 0.657599925994873, "reward_std": 0.24575895071029663, "rewards/rna_reward_fn/mean": 0.657599925994873, "rewards/rna_reward_fn/std": 0.31881189346313477, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 144.53125, "completions/mean_terminated_length": 144.53125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.212866373360157, "epoch": 0.5882352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 0.4694586992263794, "learning_rate": 8.07843137254902e-07, "loss": -0.0, "num_tokens": 7701408.0, "reward": 0.5784563422203064, "reward_std": 0.2643548846244812, "rewards/rna_reward_fn/mean": 0.5784563422203064, "rewards/rna_reward_fn/std": 0.3683941066265106, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 138.21875, "completions/mean_terminated_length": 138.21875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.17988762259483337, "epoch": 0.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.44588983058929443, "learning_rate": 8.03921568627451e-07, "loss": 0.0, "num_tokens": 7843968.0, "reward": 0.6563807725906372, "reward_std": 0.2578202784061432, "rewards/rna_reward_fn/mean": 0.6563807725906372, "rewards/rna_reward_fn/std": 0.3404718339443207, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 181.0, "completions/mean_terminated_length": 181.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.22444826364517212, "epoch": 0.611764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.49978184700012207, "learning_rate": 8e-07, "loss": -0.0, "num_tokens": 8030336.0, "reward": 0.6426054239273071, "reward_std": 0.2517712712287903, "rewards/rna_reward_fn/mean": 0.6426054239273071, "rewards/rna_reward_fn/std": 0.3629717528820038, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 185.40625, "completions/mean_terminated_length": 185.40625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.20722465217113495, "epoch": 0.6235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.6321276426315308, "learning_rate": 7.960784313725489e-07, "loss": -0.0, "num_tokens": 8221216.0, "reward": 0.7105848789215088, "reward_std": 0.23574814200401306, "rewards/rna_reward_fn/mean": 0.7105848789215088, "rewards/rna_reward_fn/std": 0.3385322690010071, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.19676074385643005, "epoch": 0.6352941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 0.49395662546157837, "learning_rate": 7.92156862745098e-07, "loss": 0.0, "num_tokens": 8373920.0, "reward": 0.5770894885063171, "reward_std": 0.2644929885864258, "rewards/rna_reward_fn/mean": 0.5770894289016724, "rewards/rna_reward_fn/std": 0.3790797293186188, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 159.6875, "completions/mean_terminated_length": 159.6875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.18705828487873077, "epoch": 0.6470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 0.4197390079498291, "learning_rate": 7.88235294117647e-07, "loss": 0.0, "num_tokens": 8538464.0, "reward": 0.5764464139938354, "reward_std": 0.21550722420215607, "rewards/rna_reward_fn/mean": 0.5764464139938354, "rewards/rna_reward_fn/std": 0.364503413438797, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.21118487417697906, "epoch": 0.6588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.37146326899528503, "learning_rate": 7.84313725490196e-07, "loss": 0.0, "num_tokens": 8689120.0, "reward": 0.6104137897491455, "reward_std": 0.23754771053791046, "rewards/rna_reward_fn/mean": 0.6104137897491455, "rewards/rna_reward_fn/std": 0.3665221333503723, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 160.65625, "completions/mean_terminated_length": 160.65625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1945827156305313, "epoch": 0.6705882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 0.4072308838367462, "learning_rate": 7.803921568627451e-07, "loss": 0.0, "num_tokens": 8854656.0, "reward": 0.6713041067123413, "reward_std": 0.2212895005941391, "rewards/rna_reward_fn/mean": 0.6713041067123413, "rewards/rna_reward_fn/std": 0.3392506539821625, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.18257632106542587, "epoch": 0.6823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.4942987561225891, "learning_rate": 7.764705882352941e-07, "loss": 0.0, "num_tokens": 9001216.0, "reward": 0.6629120707511902, "reward_std": 0.22726097702980042, "rewards/rna_reward_fn/mean": 0.6629120707511902, "rewards/rna_reward_fn/std": 0.31348657608032227, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 147.03125, "completions/mean_terminated_length": 147.03125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.20158874243497849, "epoch": 0.6941176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 0.5187806487083435, "learning_rate": 7.725490196078432e-07, "loss": 0.0, "num_tokens": 9152800.0, "reward": 0.6476730108261108, "reward_std": 0.24552714824676514, "rewards/rna_reward_fn/mean": 0.6476730108261108, "rewards/rna_reward_fn/std": 0.33643367886543274, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 159.4375, "completions/mean_terminated_length": 159.4375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.18591003119945526, "epoch": 0.7058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.36044302582740784, "learning_rate": 7.686274509803921e-07, "loss": 0.0, "num_tokens": 9317088.0, "reward": 0.6832787394523621, "reward_std": 0.22806429862976074, "rewards/rna_reward_fn/mean": 0.6832787394523621, "rewards/rna_reward_fn/std": 0.32348689436912537, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 160.96875, "completions/mean_terminated_length": 160.96875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.21002116054296494, "epoch": 0.7176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.5378114581108093, "learning_rate": 7.647058823529411e-07, "loss": -0.0, "num_tokens": 9482944.0, "reward": 0.6531599760055542, "reward_std": 0.22567519545555115, "rewards/rna_reward_fn/mean": 0.653160035610199, "rewards/rna_reward_fn/std": 0.33769848942756653, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 116.75, "completions/mean_terminated_length": 116.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.18150582909584045, "epoch": 0.7294117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 0.39785146713256836, "learning_rate": 7.607843137254901e-07, "loss": -0.0, "num_tokens": 9603520.0, "reward": 0.565564751625061, "reward_std": 0.2807776927947998, "rewards/rna_reward_fn/mean": 0.565564751625061, "rewards/rna_reward_fn/std": 0.38936248421669006, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 147.78125, "completions/mean_terminated_length": 147.78125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.189855195581913, "epoch": 0.7411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.4473720192909241, "learning_rate": 7.568627450980392e-07, "loss": -0.0, "num_tokens": 9755872.0, "reward": 0.6822654008865356, "reward_std": 0.23419374227523804, "rewards/rna_reward_fn/mean": 0.6822654008865356, "rewards/rna_reward_fn/std": 0.32637539505958557, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 171.28125, "completions/mean_terminated_length": 171.28125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.19365741312503815, "epoch": 0.7529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.5170744061470032, "learning_rate": 7.529411764705882e-07, "loss": -0.0, "num_tokens": 9932288.0, "reward": 0.6570923328399658, "reward_std": 0.24268731474876404, "rewards/rna_reward_fn/mean": 0.6570923328399658, "rewards/rna_reward_fn/std": 0.3360862731933594, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 138.5625, "completions/mean_terminated_length": 138.5625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.15700556337833405, "epoch": 0.7647058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 0.669632077217102, "learning_rate": 7.490196078431373e-07, "loss": -0.0, "num_tokens": 10075200.0, "reward": 0.5884541273117065, "reward_std": 0.25077739357948303, "rewards/rna_reward_fn/mean": 0.5884541869163513, "rewards/rna_reward_fn/std": 0.3707042634487152, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.1868809014558792, "epoch": 0.7764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.496881365776062, "learning_rate": 7.450980392156863e-07, "loss": -0.0, "num_tokens": 10227648.0, "reward": 0.6189287900924683, "reward_std": 0.23646032810211182, "rewards/rna_reward_fn/mean": 0.6189287900924683, "rewards/rna_reward_fn/std": 0.3614950180053711, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 127.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.17434925585985184, "epoch": 0.788235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.5550652742385864, "learning_rate": 7.411764705882352e-07, "loss": 0.0, "num_tokens": 10359488.0, "reward": 0.5918734073638916, "reward_std": 0.2727334499359131, "rewards/rna_reward_fn/mean": 0.5918734073638916, "rewards/rna_reward_fn/std": 0.35672324895858765, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 151.96875, "completions/mean_terminated_length": 151.96875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.17505493760108948, "epoch": 0.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.3833948075771332, "learning_rate": 7.372549019607843e-07, "loss": -0.0, "num_tokens": 10516128.0, "reward": 0.7000205516815186, "reward_std": 0.23740704357624054, "rewards/rna_reward_fn/mean": 0.7000205516815186, "rewards/rna_reward_fn/std": 0.3234153985977173, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 141.15625, "completions/mean_terminated_length": 141.15625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.17628953605890274, "epoch": 0.8117647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 0.3673928678035736, "learning_rate": 7.333333333333332e-07, "loss": 0.0, "num_tokens": 10661696.0, "reward": 0.6538941860198975, "reward_std": 0.19288064539432526, "rewards/rna_reward_fn/mean": 0.6538941860198975, "rewards/rna_reward_fn/std": 0.3515564203262329, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 195.53125, "completions/mean_terminated_length": 195.53125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.18974752724170685, "epoch": 0.8235294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 0.719358503818512, "learning_rate": 7.294117647058823e-07, "loss": -0.0, "num_tokens": 10862944.0, "reward": 0.5886421203613281, "reward_std": 0.23114809393882751, "rewards/rna_reward_fn/mean": 0.5886421203613281, "rewards/rna_reward_fn/std": 0.36729925870895386, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 156.46875, "completions/mean_terminated_length": 156.46875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.17211396992206573, "epoch": 0.8352941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 0.4566245377063751, "learning_rate": 7.254901960784313e-07, "loss": 0.0, "num_tokens": 11024192.0, "reward": 0.6206304430961609, "reward_std": 0.20096182823181152, "rewards/rna_reward_fn/mean": 0.6206304430961609, "rewards/rna_reward_fn/std": 0.3349648714065552, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 124.84375, "completions/mean_terminated_length": 124.84375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.16766826063394547, "epoch": 0.8470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 0.4164656698703766, "learning_rate": 7.215686274509804e-07, "loss": -0.0, "num_tokens": 11153056.0, "reward": 0.6351762413978577, "reward_std": 0.2213377058506012, "rewards/rna_reward_fn/mean": 0.6351762413978577, "rewards/rna_reward_fn/std": 0.3493310809135437, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 129.65625, "completions/mean_terminated_length": 129.65625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.16023673117160797, "epoch": 0.8588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.41499394178390503, "learning_rate": 7.176470588235294e-07, "loss": 0.0, "num_tokens": 11286848.0, "reward": 0.6752070784568787, "reward_std": 0.24617840349674225, "rewards/rna_reward_fn/mean": 0.6752070784568787, "rewards/rna_reward_fn/std": 0.34732139110565186, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 164.9375, "completions/mean_terminated_length": 164.9375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.18363939225673676, "epoch": 0.8705882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 0.45577237010002136, "learning_rate": 7.137254901960785e-07, "loss": -0.0, "num_tokens": 11456768.0, "reward": 0.5772933959960938, "reward_std": 0.23847423493862152, "rewards/rna_reward_fn/mean": 0.5772933959960938, "rewards/rna_reward_fn/std": 0.3823261260986328, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 188.28125, "completions/mean_terminated_length": 188.28125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.1838960349559784, "epoch": 0.8823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.5237012505531311, "learning_rate": 7.098039215686274e-07, "loss": 0.0, "num_tokens": 11650592.0, "reward": 0.6181286573410034, "reward_std": 0.2555590569972992, "rewards/rna_reward_fn/mean": 0.6181286573410034, "rewards/rna_reward_fn/std": 0.37019652128219604, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 182.1875, "completions/mean_terminated_length": 182.1875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.1790659874677658, "epoch": 0.8941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 0.4818723499774933, "learning_rate": 7.058823529411765e-07, "loss": -0.0, "num_tokens": 11838176.0, "reward": 0.578412652015686, "reward_std": 0.22860457003116608, "rewards/rna_reward_fn/mean": 0.578412652015686, "rewards/rna_reward_fn/std": 0.35265785455703735, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 200.21875, "completions/mean_terminated_length": 200.21875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.18565233796834946, "epoch": 0.9058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.7948216795921326, "learning_rate": 7.019607843137254e-07, "loss": 0.0, "num_tokens": 12044224.0, "reward": 0.6187993288040161, "reward_std": 0.2622474431991577, "rewards/rna_reward_fn/mean": 0.6187993288040161, "rewards/rna_reward_fn/std": 0.326750248670578, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 145.8125, "completions/mean_terminated_length": 145.8125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.17154797911643982, "epoch": 0.9176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.47565799951553345, "learning_rate": 6.980392156862744e-07, "loss": -0.0, "num_tokens": 12194560.0, "reward": 0.5971746444702148, "reward_std": 0.18512360751628876, "rewards/rna_reward_fn/mean": 0.5971747040748596, "rewards/rna_reward_fn/std": 0.3710518777370453, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 128.71875, "completions/mean_terminated_length": 128.71875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.15196984261274338, "epoch": 0.9294117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 0.4943343698978424, "learning_rate": 6.941176470588235e-07, "loss": -0.0, "num_tokens": 12327392.0, "reward": 0.6471496820449829, "reward_std": 0.22329822182655334, "rewards/rna_reward_fn/mean": 0.6471496820449829, "rewards/rna_reward_fn/std": 0.33536407351493835, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 137.84375, "completions/mean_terminated_length": 137.84375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.16948848217725754, "epoch": 0.9411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.4759957492351532, "learning_rate": 6.901960784313725e-07, "loss": -0.0, "num_tokens": 12469568.0, "reward": 0.659608006477356, "reward_std": 0.18602336943149567, "rewards/rna_reward_fn/mean": 0.659608006477356, "rewards/rna_reward_fn/std": 0.3731914460659027, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.18501683324575424, "epoch": 0.9529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.43785735964775085, "learning_rate": 6.862745098039216e-07, "loss": 0.0, "num_tokens": 12620864.0, "reward": 0.620478630065918, "reward_std": 0.22393935918807983, "rewards/rna_reward_fn/mean": 0.620478630065918, "rewards/rna_reward_fn/std": 0.35981276631355286, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 150.1875, "completions/mean_terminated_length": 150.1875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.1829531416296959, "epoch": 0.9647058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 0.4392038583755493, "learning_rate": 6.823529411764706e-07, "loss": 0.0, "num_tokens": 12775680.0, "reward": 0.6712214350700378, "reward_std": 0.2174052894115448, "rewards/rna_reward_fn/mean": 0.6712214946746826, "rewards/rna_reward_fn/std": 0.3370954990386963, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 141.8125, "completions/mean_terminated_length": 141.8125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.1686822921037674, "epoch": 0.9764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.4484212398529053, "learning_rate": 6.784313725490196e-07, "loss": -0.0, "num_tokens": 12921920.0, "reward": 0.6464422345161438, "reward_std": 0.2250806838274002, "rewards/rna_reward_fn/mean": 0.6464422345161438, "rewards/rna_reward_fn/std": 0.3622319996356964, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 166.65625, "completions/mean_terminated_length": 166.65625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.17645781487226486, "epoch": 0.9882352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 0.7668079137802124, "learning_rate": 6.745098039215686e-07, "loss": 0.0, "num_tokens": 13093600.0, "reward": 0.6832870244979858, "reward_std": 0.25750601291656494, "rewards/rna_reward_fn/mean": 0.6832869648933411, "rewards/rna_reward_fn/std": 0.3430787920951843, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 167.96875, "completions/mean_terminated_length": 167.96875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.17668870836496353, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.43097105622291565, "learning_rate": 6.705882352941176e-07, "loss": 0.0, "num_tokens": 13266624.0, "reward": 0.5539568662643433, "reward_std": 0.22693298757076263, "rewards/rna_reward_fn/mean": 0.5539568066596985, "rewards/rna_reward_fn/std": 0.38347697257995605, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 182.3125, "completions/mean_terminated_length": 182.3125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.1827656850218773, "epoch": 1.011764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.5608375668525696, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "num_tokens": 13454336.0, "reward": 0.7320628762245178, "reward_std": 0.22256582975387573, "rewards/rna_reward_fn/mean": 0.7320628762245178, "rewards/rna_reward_fn/std": 0.30846187472343445, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 140.625, "completions/mean_terminated_length": 140.625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.18483393639326096, "epoch": 1.0235294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 0.4667685627937317, "learning_rate": 6.627450980392156e-07, "loss": 0.0, "num_tokens": 13599360.0, "reward": 0.6894385814666748, "reward_std": 0.20523157715797424, "rewards/rna_reward_fn/mean": 0.6894385814666748, "rewards/rna_reward_fn/std": 0.3155847191810608, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 150.46875, "completions/mean_terminated_length": 150.46875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.16182925552129745, "epoch": 1.035294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.6056375503540039, "learning_rate": 6.588235294117647e-07, "loss": -0.0, "num_tokens": 13754464.0, "reward": 0.6177388429641724, "reward_std": 0.24611341953277588, "rewards/rna_reward_fn/mean": 0.6177388429641724, "rewards/rna_reward_fn/std": 0.3494950830936432, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 144.09375, "completions/mean_terminated_length": 144.09375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.17024414986371994, "epoch": 1.0470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 0.4357620179653168, "learning_rate": 6.549019607843137e-07, "loss": -0.0, "num_tokens": 13903040.0, "reward": 0.611262857913971, "reward_std": 0.19428220391273499, "rewards/rna_reward_fn/mean": 0.611262857913971, "rewards/rna_reward_fn/std": 0.3793390393257141, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 132.5625, "completions/mean_terminated_length": 132.5625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.16757714748382568, "epoch": 1.0588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.440759539604187, "learning_rate": 6.509803921568627e-07, "loss": -0.0, "num_tokens": 14039808.0, "reward": 0.6882448196411133, "reward_std": 0.19556942582130432, "rewards/rna_reward_fn/mean": 0.6882448196411133, "rewards/rna_reward_fn/std": 0.32508718967437744, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 143.78125, "completions/mean_terminated_length": 143.78125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.1645500287413597, "epoch": 1.0705882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 0.5613058805465698, "learning_rate": 6.470588235294117e-07, "loss": 0.0, "num_tokens": 14188064.0, "reward": 0.6789584159851074, "reward_std": 0.19199398159980774, "rewards/rna_reward_fn/mean": 0.6789584159851074, "rewards/rna_reward_fn/std": 0.3482169210910797, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 118.34375, "completions/mean_terminated_length": 118.34375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.14176590740680695, "epoch": 1.0823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 0.4092370867729187, "learning_rate": 6.431372549019608e-07, "loss": -0.0, "num_tokens": 14310272.0, "reward": 0.650740921497345, "reward_std": 0.18103614449501038, "rewards/rna_reward_fn/mean": 0.650740921497345, "rewards/rna_reward_fn/std": 0.32734215259552, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 154.3125, "completions/mean_terminated_length": 154.3125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.176346056163311, "epoch": 1.0941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 0.4459090232849121, "learning_rate": 6.392156862745097e-07, "loss": 0.0, "num_tokens": 14469312.0, "reward": 0.6732466816902161, "reward_std": 0.22345304489135742, "rewards/rna_reward_fn/mean": 0.6732466816902161, "rewards/rna_reward_fn/std": 0.3369784951210022, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 145.3125, "completions/mean_terminated_length": 145.3125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.1685405969619751, "epoch": 1.1058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.5043669939041138, "learning_rate": 6.352941176470588e-07, "loss": -0.0, "num_tokens": 14619136.0, "reward": 0.677271842956543, "reward_std": 0.20296773314476013, "rewards/rna_reward_fn/mean": 0.677271842956543, "rewards/rna_reward_fn/std": 0.320669025182724, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 170.1875, "completions/mean_terminated_length": 170.1875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.18431222438812256, "epoch": 1.1176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.6736860275268555, "learning_rate": 6.313725490196078e-07, "loss": -0.0, "num_tokens": 14794432.0, "reward": 0.6684234738349915, "reward_std": 0.259125292301178, "rewards/rna_reward_fn/mean": 0.6684235334396362, "rewards/rna_reward_fn/std": 0.34210121631622314, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 157.6875, "completions/mean_terminated_length": 157.6875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.16836901009082794, "epoch": 1.1294117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 0.4569699764251709, "learning_rate": 6.274509803921569e-07, "loss": -0.0, "num_tokens": 14956928.0, "reward": 0.68538498878479, "reward_std": 0.1874302327632904, "rewards/rna_reward_fn/mean": 0.68538498878479, "rewards/rna_reward_fn/std": 0.295845091342926, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 140.21875, "completions/mean_terminated_length": 140.21875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.158738911151886, "epoch": 1.1411764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 0.4725809395313263, "learning_rate": 6.235294117647059e-07, "loss": 0.0, "num_tokens": 15101536.0, "reward": 0.6654532551765442, "reward_std": 0.18864062428474426, "rewards/rna_reward_fn/mean": 0.6654532551765442, "rewards/rna_reward_fn/std": 0.3371845781803131, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 162.8125, "completions/mean_terminated_length": 162.8125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17738928645849228, "epoch": 1.1529411764705881, "frac_reward_zero_std": 0.0, "grad_norm": 0.5798309445381165, "learning_rate": 6.196078431372548e-07, "loss": -0.0, "num_tokens": 15269280.0, "reward": 0.7147358655929565, "reward_std": 0.21203583478927612, "rewards/rna_reward_fn/mean": 0.7147358655929565, "rewards/rna_reward_fn/std": 0.33255505561828613, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 168.03125, "completions/mean_terminated_length": 168.03125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.17116892337799072, "epoch": 1.1647058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 0.5520632863044739, "learning_rate": 6.156862745098039e-07, "loss": -0.0, "num_tokens": 15442368.0, "reward": 0.6365219950675964, "reward_std": 0.20218491554260254, "rewards/rna_reward_fn/mean": 0.6365219950675964, "rewards/rna_reward_fn/std": 0.35175827145576477, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.17306677252054214, "epoch": 1.1764705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 0.4389256238937378, "learning_rate": 6.117647058823529e-07, "loss": 0.0, "num_tokens": 15584704.0, "reward": 0.7388399839401245, "reward_std": 0.16607630252838135, "rewards/rna_reward_fn/mean": 0.7388399839401245, "rewards/rna_reward_fn/std": 0.2576732635498047, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 137.40625, "completions/mean_terminated_length": 137.40625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.15397901087999344, "epoch": 1.188235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.5594757795333862, "learning_rate": 6.078431372549019e-07, "loss": -0.0, "num_tokens": 15726432.0, "reward": 0.7157045602798462, "reward_std": 0.22128766775131226, "rewards/rna_reward_fn/mean": 0.7157045602798462, "rewards/rna_reward_fn/std": 0.2969537079334259, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 127.78125, "completions/mean_terminated_length": 127.78125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.17225481569766998, "epoch": 1.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.40622541308403015, "learning_rate": 6.039215686274509e-07, "loss": -0.0, "num_tokens": 15858304.0, "reward": 0.7043038010597229, "reward_std": 0.22727924585342407, "rewards/rna_reward_fn/mean": 0.7043038606643677, "rewards/rna_reward_fn/std": 0.33978909254074097, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.17464321851730347, "epoch": 1.2117647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 0.4603181779384613, "learning_rate": 6e-07, "loss": -0.0, "num_tokens": 16030976.0, "reward": 0.61054527759552, "reward_std": 0.22179073095321655, "rewards/rna_reward_fn/mean": 0.61054527759552, "rewards/rna_reward_fn/std": 0.37210676074028015, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 156.8125, "completions/mean_terminated_length": 156.8125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.1658085659146309, "epoch": 1.223529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.4843849539756775, "learning_rate": 5.96078431372549e-07, "loss": -0.0, "num_tokens": 16192576.0, "reward": 0.6978532075881958, "reward_std": 0.1981123685836792, "rewards/rna_reward_fn/mean": 0.6978532671928406, "rewards/rna_reward_fn/std": 0.3141247630119324, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 181.1875, "completions/mean_terminated_length": 181.1875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.16212371736764908, "epoch": 1.2352941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 0.5290284752845764, "learning_rate": 5.921568627450981e-07, "loss": 0.0, "num_tokens": 16379136.0, "reward": 0.6463083028793335, "reward_std": 0.1896321177482605, "rewards/rna_reward_fn/mean": 0.6463083028793335, "rewards/rna_reward_fn/std": 0.36457034945487976, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 124.3125, "completions/mean_terminated_length": 124.3125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.15162574499845505, "epoch": 1.2470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 0.47445422410964966, "learning_rate": 5.88235294117647e-07, "loss": 0.0, "num_tokens": 16507456.0, "reward": 0.672465980052948, "reward_std": 0.20273976027965546, "rewards/rna_reward_fn/mean": 0.6724659204483032, "rewards/rna_reward_fn/std": 0.3352026343345642, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 154.25, "completions/mean_terminated_length": 154.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.1651393622159958, "epoch": 1.2588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.48081472516059875, "learning_rate": 5.843137254901961e-07, "loss": -0.0, "num_tokens": 16666432.0, "reward": 0.6745295524597168, "reward_std": 0.21466964483261108, "rewards/rna_reward_fn/mean": 0.6745295524597168, "rewards/rna_reward_fn/std": 0.3604423701763153, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 176.34375, "completions/mean_terminated_length": 176.34375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.16943742334842682, "epoch": 1.2705882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 0.4931647777557373, "learning_rate": 5.803921568627451e-07, "loss": 0.0, "num_tokens": 16848032.0, "reward": 0.6875256896018982, "reward_std": 0.2435401976108551, "rewards/rna_reward_fn/mean": 0.6875256896018982, "rewards/rna_reward_fn/std": 0.3279384672641754, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 158.09375, "completions/mean_terminated_length": 158.09375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.17465446144342422, "epoch": 1.2823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 0.5001822113990784, "learning_rate": 5.76470588235294e-07, "loss": 0.0, "num_tokens": 17010944.0, "reward": 0.6029446125030518, "reward_std": 0.1757221221923828, "rewards/rna_reward_fn/mean": 0.6029446125030518, "rewards/rna_reward_fn/std": 0.35652756690979004, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 167.40625, "completions/mean_terminated_length": 167.40625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.16541431099176407, "epoch": 1.2941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 0.4689631760120392, "learning_rate": 5.725490196078431e-07, "loss": -0.0, "num_tokens": 17183392.0, "reward": 0.6704152226448059, "reward_std": 0.20997245609760284, "rewards/rna_reward_fn/mean": 0.6704152226448059, "rewards/rna_reward_fn/std": 0.32471874356269836, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 141.71875, "completions/mean_terminated_length": 141.71875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.1647869274020195, "epoch": 1.3058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.5760033130645752, "learning_rate": 5.686274509803921e-07, "loss": -0.0, "num_tokens": 17329536.0, "reward": 0.6938682198524475, "reward_std": 0.20044496655464172, "rewards/rna_reward_fn/mean": 0.6938682198524475, "rewards/rna_reward_fn/std": 0.32881274819374084, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 115.96875, "completions/mean_terminated_length": 115.96875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.1390109360218048, "epoch": 1.3176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.5902699828147888, "learning_rate": 5.647058823529412e-07, "loss": 0.0, "num_tokens": 17449312.0, "reward": 0.651271402835846, "reward_std": 0.17913030087947845, "rewards/rna_reward_fn/mean": 0.651271402835846, "rewards/rna_reward_fn/std": 0.3490009009838104, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 179.8125, "completions/mean_terminated_length": 179.8125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.16215970367193222, "epoch": 1.3294117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 0.6261849403381348, "learning_rate": 5.607843137254902e-07, "loss": -0.0, "num_tokens": 17634464.0, "reward": 0.6400759220123291, "reward_std": 0.2095731794834137, "rewards/rna_reward_fn/mean": 0.6400759220123291, "rewards/rna_reward_fn/std": 0.34743088483810425, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 139.59375, "completions/mean_terminated_length": 139.59375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.17950539290905, "epoch": 1.3411764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 0.4431358277797699, "learning_rate": 5.568627450980392e-07, "loss": 0.0, "num_tokens": 17778432.0, "reward": 0.7148804068565369, "reward_std": 0.19681406021118164, "rewards/rna_reward_fn/mean": 0.7148803472518921, "rewards/rna_reward_fn/std": 0.2995694577693939, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 167.6875, "completions/mean_terminated_length": 167.6875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16394728422164917, "epoch": 1.3529411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 0.4245275557041168, "learning_rate": 5.529411764705882e-07, "loss": -0.0, "num_tokens": 17951168.0, "reward": 0.6865168213844299, "reward_std": 0.21481367945671082, "rewards/rna_reward_fn/mean": 0.6865168213844299, "rewards/rna_reward_fn/std": 0.3217703402042389, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 146.6875, "completions/mean_terminated_length": 146.6875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.16379400342702866, "epoch": 1.3647058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 0.7668678760528564, "learning_rate": 5.490196078431373e-07, "loss": -0.0, "num_tokens": 18102400.0, "reward": 0.7100426554679871, "reward_std": 0.20684288442134857, "rewards/rna_reward_fn/mean": 0.7100426554679871, "rewards/rna_reward_fn/std": 0.32808709144592285, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 147.40625, "completions/mean_terminated_length": 147.40625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.16369594633579254, "epoch": 1.3764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.4491204619407654, "learning_rate": 5.450980392156862e-07, "loss": -0.0, "num_tokens": 18254368.0, "reward": 0.6345921754837036, "reward_std": 0.17989099025726318, "rewards/rna_reward_fn/mean": 0.6345921754837036, "rewards/rna_reward_fn/std": 0.3739507794380188, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 128.96875, "completions/mean_terminated_length": 128.96875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.16341928392648697, "epoch": 1.388235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.40218448638916016, "learning_rate": 5.411764705882353e-07, "loss": 0.0, "num_tokens": 18387456.0, "reward": 0.6973093748092651, "reward_std": 0.19106432795524597, "rewards/rna_reward_fn/mean": 0.6973093748092651, "rewards/rna_reward_fn/std": 0.328565388917923, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 185.3125, "completions/mean_terminated_length": 185.3125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.15643662959337234, "epoch": 1.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.4641011953353882, "learning_rate": 5.372549019607843e-07, "loss": -0.0, "num_tokens": 18578240.0, "reward": 0.6982426643371582, "reward_std": 0.17999790608882904, "rewards/rna_reward_fn/mean": 0.6982426643371582, "rewards/rna_reward_fn/std": 0.3187488615512848, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.16167542338371277, "epoch": 1.4117647058823528, "frac_reward_zero_std": 0.0, "grad_norm": 0.4710671305656433, "learning_rate": 5.333333333333333e-07, "loss": -0.0, "num_tokens": 18734016.0, "reward": 0.765220046043396, "reward_std": 0.16310608386993408, "rewards/rna_reward_fn/mean": 0.765220046043396, "rewards/rna_reward_fn/std": 0.30073776841163635, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 200.53125, "completions/mean_terminated_length": 200.53125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17333289235830307, "epoch": 1.423529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.5605267882347107, "learning_rate": 5.294117647058823e-07, "loss": -0.0, "num_tokens": 18940384.0, "reward": 0.6207563877105713, "reward_std": 0.2605891227722168, "rewards/rna_reward_fn/mean": 0.6207563877105713, "rewards/rna_reward_fn/std": 0.35733622312545776, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 126.90625, "completions/mean_terminated_length": 126.90625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.16177111864089966, "epoch": 1.4352941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 0.5492433905601501, "learning_rate": 5.254901960784313e-07, "loss": 0.0, "num_tokens": 19071360.0, "reward": 0.6156597137451172, "reward_std": 0.2084151953458786, "rewards/rna_reward_fn/mean": 0.6156597137451172, "rewards/rna_reward_fn/std": 0.3588009178638458, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 126.15625, "completions/mean_terminated_length": 126.15625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.1655115783214569, "epoch": 1.4470588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 0.5015555024147034, "learning_rate": 5.215686274509804e-07, "loss": 0.0, "num_tokens": 19201568.0, "reward": 0.6790971755981445, "reward_std": 0.20820938050746918, "rewards/rna_reward_fn/mean": 0.6790972352027893, "rewards/rna_reward_fn/std": 0.33763545751571655, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 153.75, "completions/mean_terminated_length": 153.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.1595897227525711, "epoch": 1.4588235294117646, "frac_reward_zero_std": 0.0, "grad_norm": 0.5314822793006897, "learning_rate": 5.176470588235294e-07, "loss": 0.0, "num_tokens": 19360032.0, "reward": 0.6510605812072754, "reward_std": 0.18497204780578613, "rewards/rna_reward_fn/mean": 0.6510605812072754, "rewards/rna_reward_fn/std": 0.3650972247123718, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 134.5625, "completions/mean_terminated_length": 134.5625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.1490706205368042, "epoch": 1.4705882352941178, "frac_reward_zero_std": 0.0, "grad_norm": 0.5578471422195435, "learning_rate": 5.137254901960784e-07, "loss": -0.0, "num_tokens": 19498848.0, "reward": 0.6481872797012329, "reward_std": 0.19116738438606262, "rewards/rna_reward_fn/mean": 0.6481872797012329, "rewards/rna_reward_fn/std": 0.32832634449005127, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 186.0625, "completions/mean_terminated_length": 186.0625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.16315071284770966, "epoch": 1.4823529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 0.47001388669013977, "learning_rate": 5.098039215686274e-07, "loss": 0.0, "num_tokens": 19690400.0, "reward": 0.6869475245475769, "reward_std": 0.21966272592544556, "rewards/rna_reward_fn/mean": 0.6869475245475769, "rewards/rna_reward_fn/std": 0.3061429262161255, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.1544899046421051, "epoch": 1.4941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 0.7163305878639221, "learning_rate": 5.058823529411765e-07, "loss": 0.0, "num_tokens": 19854496.0, "reward": 0.7104751467704773, "reward_std": 0.17693877220153809, "rewards/rna_reward_fn/mean": 0.7104751467704773, "rewards/rna_reward_fn/std": 0.30990538001060486, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 134.6875, "completions/mean_terminated_length": 134.6875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.16278471052646637, "epoch": 1.5058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 0.7567697167396545, "learning_rate": 5.019607843137255e-07, "loss": -0.0, "num_tokens": 19993440.0, "reward": 0.6815826296806335, "reward_std": 0.20137576758861542, "rewards/rna_reward_fn/mean": 0.6815826296806335, "rewards/rna_reward_fn/std": 0.32526591420173645, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 142.5625, "completions/mean_terminated_length": 142.5625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.16126833856105804, "epoch": 1.5176470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 0.5958517789840698, "learning_rate": 4.980392156862744e-07, "loss": 0.0, "num_tokens": 20140448.0, "reward": 0.6496865153312683, "reward_std": 0.23397710919380188, "rewards/rna_reward_fn/mean": 0.6496865153312683, "rewards/rna_reward_fn/std": 0.3660079836845398, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 178.3125, "completions/mean_terminated_length": 178.3125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16705547273159027, "epoch": 1.5294117647058822, "frac_reward_zero_std": 0.0, "grad_norm": 0.5045768618583679, "learning_rate": 4.941176470588235e-07, "loss": 0.0, "num_tokens": 20324064.0, "reward": 0.6084290146827698, "reward_std": 0.22301070392131805, "rewards/rna_reward_fn/mean": 0.608428955078125, "rewards/rna_reward_fn/std": 0.37412387132644653, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 178.21875, "completions/mean_terminated_length": 178.21875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.16225259751081467, "epoch": 1.5411764705882351, "frac_reward_zero_std": 0.0, "grad_norm": 0.4790975749492645, "learning_rate": 4.901960784313725e-07, "loss": -0.0, "num_tokens": 20507584.0, "reward": 0.6834284067153931, "reward_std": 0.16327084600925446, "rewards/rna_reward_fn/mean": 0.6834284067153931, "rewards/rna_reward_fn/std": 0.3331601321697235, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 124.46875, "completions/mean_terminated_length": 124.46875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.14231518656015396, "epoch": 1.5529411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 0.45782116055488586, "learning_rate": 4.862745098039216e-07, "loss": -0.0, "num_tokens": 20636064.0, "reward": 0.6696175336837769, "reward_std": 0.1951877474784851, "rewards/rna_reward_fn/mean": 0.6696175336837769, "rewards/rna_reward_fn/std": 0.3469404876232147, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 153.09375, "completions/mean_terminated_length": 153.09375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.14148423075675964, "epoch": 1.5647058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 0.6516547203063965, "learning_rate": 4.823529411764705e-07, "loss": -0.0, "num_tokens": 20793856.0, "reward": 0.6711336374282837, "reward_std": 0.2223963439464569, "rewards/rna_reward_fn/mean": 0.6711336374282837, "rewards/rna_reward_fn/std": 0.3334668278694153, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 144.34375, "completions/mean_terminated_length": 144.34375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.1529795005917549, "epoch": 1.576470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.5148042440414429, "learning_rate": 4.784313725490196e-07, "loss": 0.0, "num_tokens": 20942688.0, "reward": 0.759110152721405, "reward_std": 0.16160593926906586, "rewards/rna_reward_fn/mean": 0.7591102123260498, "rewards/rna_reward_fn/std": 0.2931617796421051, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 108.34375, "completions/mean_terminated_length": 108.34375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.1443817839026451, "epoch": 1.5882352941176472, "frac_reward_zero_std": 0.0, "grad_norm": 0.42829352617263794, "learning_rate": 4.7450980392156857e-07, "loss": -0.0, "num_tokens": 21054656.0, "reward": 0.6639102697372437, "reward_std": 0.20781482756137848, "rewards/rna_reward_fn/mean": 0.6639102697372437, "rewards/rna_reward_fn/std": 0.3437131941318512, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 175.03125, "completions/mean_terminated_length": 175.03125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.15896137803792953, "epoch": 1.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.5342750549316406, "learning_rate": 4.705882352941176e-07, "loss": 0.0, "num_tokens": 21234912.0, "reward": 0.6274444460868835, "reward_std": 0.22071924805641174, "rewards/rna_reward_fn/mean": 0.6274445056915283, "rewards/rna_reward_fn/std": 0.3473777174949646, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 143.65625, "completions/mean_terminated_length": 143.65625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.15408551692962646, "epoch": 1.611764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.49438202381134033, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "num_tokens": 21383040.0, "reward": 0.6316537857055664, "reward_std": 0.1621330976486206, "rewards/rna_reward_fn/mean": 0.6316537857055664, "rewards/rna_reward_fn/std": 0.34947502613067627, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 168.84375, "completions/mean_terminated_length": 168.84375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.17249725759029388, "epoch": 1.6235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.5168977379798889, "learning_rate": 4.627450980392157e-07, "loss": -0.0, "num_tokens": 21556960.0, "reward": 0.7472211122512817, "reward_std": 0.16369092464447021, "rewards/rna_reward_fn/mean": 0.7472211122512817, "rewards/rna_reward_fn/std": 0.27173811197280884, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 157.1875, "completions/mean_terminated_length": 157.1875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.16690535098314285, "epoch": 1.6352941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 0.5558773875236511, "learning_rate": 4.5882352941176465e-07, "loss": 0.0, "num_tokens": 21718944.0, "reward": 0.6854004859924316, "reward_std": 0.19929495453834534, "rewards/rna_reward_fn/mean": 0.6854004859924316, "rewards/rna_reward_fn/std": 0.31646665930747986, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 152.59375, "completions/mean_terminated_length": 152.59375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.1484585627913475, "epoch": 1.6470588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 0.9384368062019348, "learning_rate": 4.549019607843137e-07, "loss": -0.0, "num_tokens": 21876224.0, "reward": 0.6835744380950928, "reward_std": 0.1949320137500763, "rewards/rna_reward_fn/mean": 0.6835744380950928, "rewards/rna_reward_fn/std": 0.35554417967796326, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 127.875, "completions/mean_terminated_length": 127.875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.14056292921304703, "epoch": 1.6588235294117646, "frac_reward_zero_std": 0.0, "grad_norm": 0.4758838713169098, "learning_rate": 4.5098039215686274e-07, "loss": 0.0, "num_tokens": 22008192.0, "reward": 0.7035012245178223, "reward_std": 0.18292057514190674, "rewards/rna_reward_fn/mean": 0.703501284122467, "rewards/rna_reward_fn/std": 0.29926764965057373, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 164.59375, "completions/mean_terminated_length": 164.59375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.1475282907485962, "epoch": 1.6705882352941175, "frac_reward_zero_std": 0.0, "grad_norm": 0.5269675254821777, "learning_rate": 4.470588235294118e-07, "loss": -0.0, "num_tokens": 22177760.0, "reward": 0.724274754524231, "reward_std": 0.20411115884780884, "rewards/rna_reward_fn/mean": 0.724274754524231, "rewards/rna_reward_fn/std": 0.29461607336997986, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 166.09375, "completions/mean_terminated_length": 166.09375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.14830049872398376, "epoch": 1.6823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.5128397345542908, "learning_rate": 4.4313725490196073e-07, "loss": 0.0, "num_tokens": 22348864.0, "reward": 0.6864579916000366, "reward_std": 0.18042539060115814, "rewards/rna_reward_fn/mean": 0.6864579916000366, "rewards/rna_reward_fn/std": 0.3156171441078186, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 121.21875, "completions/mean_terminated_length": 121.21875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.14306584745645523, "epoch": 1.6941176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 0.4526241421699524, "learning_rate": 4.392156862745098e-07, "loss": 0.0, "num_tokens": 22474016.0, "reward": 0.6906402111053467, "reward_std": 0.2201388031244278, "rewards/rna_reward_fn/mean": 0.6906402111053467, "rewards/rna_reward_fn/std": 0.3415301740169525, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 111.0625, "completions/mean_terminated_length": 111.0625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.14087412506341934, "epoch": 1.7058823529411766, "frac_reward_zero_std": 0.0, "grad_norm": 0.4583019018173218, "learning_rate": 4.352941176470588e-07, "loss": 0.0, "num_tokens": 22588768.0, "reward": 0.7702864408493042, "reward_std": 0.1817162036895752, "rewards/rna_reward_fn/mean": 0.7702864408493042, "rewards/rna_reward_fn/std": 0.28576594591140747, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 152.46875, "completions/mean_terminated_length": 152.46875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.13646821677684784, "epoch": 1.7176470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 0.5821676850318909, "learning_rate": 4.313725490196078e-07, "loss": -0.0, "num_tokens": 22745920.0, "reward": 0.6735475659370422, "reward_std": 0.2079792022705078, "rewards/rna_reward_fn/mean": 0.6735475659370422, "rewards/rna_reward_fn/std": 0.34127116203308105, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 137.0625, "completions/mean_terminated_length": 137.0625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.1294446587562561, "epoch": 1.7294117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 0.47053244709968567, "learning_rate": 4.274509803921568e-07, "loss": 0.0, "num_tokens": 22887296.0, "reward": 0.7310217618942261, "reward_std": 0.16372641921043396, "rewards/rna_reward_fn/mean": 0.7310217618942261, "rewards/rna_reward_fn/std": 0.29399389028549194, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 164.03125, "completions/mean_terminated_length": 164.03125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.16281016170978546, "epoch": 1.7411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.5588626861572266, "learning_rate": 4.2352941176470586e-07, "loss": 0.0, "num_tokens": 23056288.0, "reward": 0.654833197593689, "reward_std": 0.1884084939956665, "rewards/rna_reward_fn/mean": 0.654833197593689, "rewards/rna_reward_fn/std": 0.3517378270626068, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 140.84375, "completions/mean_terminated_length": 140.84375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.15908341854810715, "epoch": 1.7529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.5507121086120605, "learning_rate": 4.196078431372549e-07, "loss": 0.0, "num_tokens": 23201536.0, "reward": 0.699113667011261, "reward_std": 0.20187973976135254, "rewards/rna_reward_fn/mean": 0.699113667011261, "rewards/rna_reward_fn/std": 0.3249177634716034, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 192.4375, "completions/mean_terminated_length": 192.4375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.15749355405569077, "epoch": 1.7647058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 0.47758468985557556, "learning_rate": 4.156862745098039e-07, "loss": 0.0, "num_tokens": 23399616.0, "reward": 0.6602087020874023, "reward_std": 0.2426632046699524, "rewards/rna_reward_fn/mean": 0.6602087020874023, "rewards/rna_reward_fn/std": 0.3394790291786194, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.1590714380145073, "epoch": 1.776470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.5084402561187744, "learning_rate": 4.117647058823529e-07, "loss": 0.0, "num_tokens": 23591488.0, "reward": 0.6650402545928955, "reward_std": 0.18303653597831726, "rewards/rna_reward_fn/mean": 0.6650401949882507, "rewards/rna_reward_fn/std": 0.33965203166007996, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 141.40625, "completions/mean_terminated_length": 141.40625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.14213567227125168, "epoch": 1.788235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.5413779020309448, "learning_rate": 4.0784313725490194e-07, "loss": -0.0, "num_tokens": 23737312.0, "reward": 0.6437839865684509, "reward_std": 0.2132418155670166, "rewards/rna_reward_fn/mean": 0.6437839865684509, "rewards/rna_reward_fn/std": 0.3476622402667999, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 140.75, "completions/mean_terminated_length": 140.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.14729295670986176, "epoch": 1.8, "frac_reward_zero_std": 0.03125, "grad_norm": 0.48154816031455994, "learning_rate": 4.03921568627451e-07, "loss": -0.0, "num_tokens": 23882464.0, "reward": 0.6620033979415894, "reward_std": 0.22405345737934113, "rewards/rna_reward_fn/mean": 0.6620033979415894, "rewards/rna_reward_fn/std": 0.3390491306781769, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 166.46875, "completions/mean_terminated_length": 166.46875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.14903101325035095, "epoch": 1.811764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.625751793384552, "learning_rate": 4e-07, "loss": -0.0, "num_tokens": 24053952.0, "reward": 0.6442551612854004, "reward_std": 0.17395520210266113, "rewards/rna_reward_fn/mean": 0.6442551612854004, "rewards/rna_reward_fn/std": 0.3670194745063782, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 157.90625, "completions/mean_terminated_length": 157.90625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.15323904901742935, "epoch": 1.8235294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 0.48200494050979614, "learning_rate": 3.96078431372549e-07, "loss": -0.0, "num_tokens": 24216672.0, "reward": 0.6359031200408936, "reward_std": 0.17717690765857697, "rewards/rna_reward_fn/mean": 0.6359031200408936, "rewards/rna_reward_fn/std": 0.32817214727401733, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 145.8125, "completions/mean_terminated_length": 145.8125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.1613752394914627, "epoch": 1.835294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.46832966804504395, "learning_rate": 3.92156862745098e-07, "loss": 0.0, "num_tokens": 24367008.0, "reward": 0.7130154371261597, "reward_std": 0.18193909525871277, "rewards/rna_reward_fn/mean": 0.7130154371261597, "rewards/rna_reward_fn/std": 0.3411928117275238, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 142.46875, "completions/mean_terminated_length": 142.46875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.13961906731128693, "epoch": 1.8470588235294119, "frac_reward_zero_std": 0.0, "grad_norm": 0.6261844038963318, "learning_rate": 3.8823529411764707e-07, "loss": -0.0, "num_tokens": 24513920.0, "reward": 0.711245596408844, "reward_std": 0.1767653077840805, "rewards/rna_reward_fn/mean": 0.7112456560134888, "rewards/rna_reward_fn/std": 0.3348366618156433, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 152.4375, "completions/mean_terminated_length": 152.4375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.1567898690700531, "epoch": 1.8588235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 0.5200847387313843, "learning_rate": 3.8431372549019606e-07, "loss": -0.0, "num_tokens": 24671040.0, "reward": 0.7147434949874878, "reward_std": 0.14905846118927002, "rewards/rna_reward_fn/mean": 0.7147434949874878, "rewards/rna_reward_fn/std": 0.3070945739746094, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 125.71875, "completions/mean_terminated_length": 125.71875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.133110411465168, "epoch": 1.8705882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 0.4239906370639801, "learning_rate": 3.8039215686274506e-07, "loss": 0.0, "num_tokens": 24800800.0, "reward": 0.640139639377594, "reward_std": 0.20033451914787292, "rewards/rna_reward_fn/mean": 0.640139639377594, "rewards/rna_reward_fn/std": 0.3294910490512848, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 134.8125, "completions/mean_terminated_length": 134.8125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.12187084183096886, "epoch": 1.8823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.38697147369384766, "learning_rate": 3.764705882352941e-07, "loss": -0.0, "num_tokens": 24939872.0, "reward": 0.6659330725669861, "reward_std": 0.16438628733158112, "rewards/rna_reward_fn/mean": 0.6659330725669861, "rewards/rna_reward_fn/std": 0.35713815689086914, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 135.5625, "completions/mean_terminated_length": 135.5625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.13703680038452148, "epoch": 1.8941176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 0.4564237594604492, "learning_rate": 3.7254901960784315e-07, "loss": 0.0, "num_tokens": 25079712.0, "reward": 0.6596216559410095, "reward_std": 0.20437049865722656, "rewards/rna_reward_fn/mean": 0.6596216559410095, "rewards/rna_reward_fn/std": 0.3517865240573883, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 177.0625, "completions/mean_terminated_length": 177.0625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.15036547183990479, "epoch": 1.9058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 0.45348137617111206, "learning_rate": 3.6862745098039214e-07, "loss": -0.0, "num_tokens": 25262048.0, "reward": 0.6836435198783875, "reward_std": 0.20624709129333496, "rewards/rna_reward_fn/mean": 0.6836435198783875, "rewards/rna_reward_fn/std": 0.32797813415527344, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 141.71875, "completions/mean_terminated_length": 141.71875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.14257021248340607, "epoch": 1.9176470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 0.4581199586391449, "learning_rate": 3.6470588235294114e-07, "loss": -0.0, "num_tokens": 25408192.0, "reward": 0.6231480836868286, "reward_std": 0.20732316374778748, "rewards/rna_reward_fn/mean": 0.6231480836868286, "rewards/rna_reward_fn/std": 0.35448968410491943, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 103.90625, "completions/mean_terminated_length": 103.90625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.11931119486689568, "epoch": 1.9294117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 0.42869991064071655, "learning_rate": 3.607843137254902e-07, "loss": -0.0, "num_tokens": 25515616.0, "reward": 0.7718137502670288, "reward_std": 0.15544265508651733, "rewards/rna_reward_fn/mean": 0.7718137502670288, "rewards/rna_reward_fn/std": 0.2820202112197876, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 118.34375, "completions/mean_terminated_length": 118.34375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.13630840182304382, "epoch": 1.9411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.4796566069126129, "learning_rate": 3.5686274509803923e-07, "loss": 0.0, "num_tokens": 25637824.0, "reward": 0.7639800310134888, "reward_std": 0.16217514872550964, "rewards/rna_reward_fn/mean": 0.7639800310134888, "rewards/rna_reward_fn/std": 0.2800072729587555, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 196.1875, "completions/mean_terminated_length": 196.1875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.1692701205611229, "epoch": 1.9529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.576678991317749, "learning_rate": 3.529411764705882e-07, "loss": 0.0, "num_tokens": 25839744.0, "reward": 0.62703537940979, "reward_std": 0.24643635749816895, "rewards/rna_reward_fn/mean": 0.62703537940979, "rewards/rna_reward_fn/std": 0.3669246435165405, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 167.96875, "completions/mean_terminated_length": 167.96875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.16024480760097504, "epoch": 1.9647058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 0.7311699390411377, "learning_rate": 3.490196078431372e-07, "loss": 0.0, "num_tokens": 26012768.0, "reward": 0.6588948369026184, "reward_std": 0.1576000452041626, "rewards/rna_reward_fn/mean": 0.6588948965072632, "rewards/rna_reward_fn/std": 0.32907265424728394, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1511036530137062, "epoch": 1.9764705882352942, "frac_reward_zero_std": 0.0, "grad_norm": 0.4694945216178894, "learning_rate": 3.4509803921568627e-07, "loss": 0.0, "num_tokens": 26220128.0, "reward": 0.6976197957992554, "reward_std": 0.19369524717330933, "rewards/rna_reward_fn/mean": 0.6976197957992554, "rewards/rna_reward_fn/std": 0.32611048221588135, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 154.5, "completions/mean_terminated_length": 154.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.15085221827030182, "epoch": 1.988235294117647, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7034254670143127, "learning_rate": 3.411764705882353e-07, "loss": 0.0, "num_tokens": 26379360.0, "reward": 0.6942508220672607, "reward_std": 0.20178331434726715, "rewards/rna_reward_fn/mean": 0.6942508220672607, "rewards/rna_reward_fn/std": 0.31030499935150146, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 160.53125, "completions/mean_terminated_length": 160.53125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.15548591315746307, "epoch": 2.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.5434289574623108, "learning_rate": 3.372549019607843e-07, "loss": -0.0, "num_tokens": 26544768.0, "reward": 0.6601583957672119, "reward_std": 0.15550854802131653, "rewards/rna_reward_fn/mean": 0.6601583361625671, "rewards/rna_reward_fn/std": 0.3311554193496704, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 160.40625, "completions/mean_terminated_length": 160.40625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.1544594094157219, "epoch": 2.011764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.6815203428268433, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "num_tokens": 26710048.0, "reward": 0.5972940921783447, "reward_std": 0.18555977940559387, "rewards/rna_reward_fn/mean": 0.5972940921783447, "rewards/rna_reward_fn/std": 0.36445632576942444, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 157.40625, "completions/mean_terminated_length": 157.40625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.14051128178834915, "epoch": 2.023529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.5093562602996826, "learning_rate": 3.2941176470588235e-07, "loss": 0.0, "num_tokens": 26872256.0, "reward": 0.6649138927459717, "reward_std": 0.2001783400774002, "rewards/rna_reward_fn/mean": 0.6649138331413269, "rewards/rna_reward_fn/std": 0.3582386374473572, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 173.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.14279819279909134, "epoch": 2.0352941176470587, "frac_reward_zero_std": 0.0, "grad_norm": 0.4454724192619324, "learning_rate": 3.2549019607843134e-07, "loss": -0.0, "num_tokens": 27051200.0, "reward": 0.7748029828071594, "reward_std": 0.14138856530189514, "rewards/rna_reward_fn/mean": 0.7748030424118042, "rewards/rna_reward_fn/std": 0.2777082026004791, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 165.8125, "completions/mean_terminated_length": 165.8125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.13190212100744247, "epoch": 2.0470588235294116, "frac_reward_zero_std": 0.0, "grad_norm": 0.4604037404060364, "learning_rate": 3.215686274509804e-07, "loss": 0.0, "num_tokens": 27222016.0, "reward": 0.6792135238647461, "reward_std": 0.17050443589687347, "rewards/rna_reward_fn/mean": 0.6792135834693909, "rewards/rna_reward_fn/std": 0.3469991087913513, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 140.21875, "completions/mean_terminated_length": 140.21875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.11882514134049416, "epoch": 2.0588235294117645, "frac_reward_zero_std": 0.0, "grad_norm": 0.42415928840637207, "learning_rate": 3.176470588235294e-07, "loss": -0.0, "num_tokens": 27366624.0, "reward": 0.618835985660553, "reward_std": 0.19730809330940247, "rewards/rna_reward_fn/mean": 0.6188360452651978, "rewards/rna_reward_fn/std": 0.3514353334903717, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 154.25, "completions/mean_terminated_length": 154.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.12727607041597366, "epoch": 2.070588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 0.5830354690551758, "learning_rate": 3.1372549019607843e-07, "loss": 0.0, "num_tokens": 27525600.0, "reward": 0.6785444617271423, "reward_std": 0.18948182463645935, "rewards/rna_reward_fn/mean": 0.6785444617271423, "rewards/rna_reward_fn/std": 0.3351566791534424, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 147.78125, "completions/mean_terminated_length": 147.78125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.14719800651073456, "epoch": 2.0823529411764707, "frac_reward_zero_std": 0.03125, "grad_norm": 0.4794676899909973, "learning_rate": 3.098039215686274e-07, "loss": 0.0, "num_tokens": 27677952.0, "reward": 0.7077100276947021, "reward_std": 0.1931176781654358, "rewards/rna_reward_fn/mean": 0.7077100276947021, "rewards/rna_reward_fn/std": 0.3137640357017517, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 142.46875, "completions/mean_terminated_length": 142.46875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.15307611972093582, "epoch": 2.0941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 0.6268736720085144, "learning_rate": 3.0588235294117647e-07, "loss": 0.0, "num_tokens": 27824864.0, "reward": 0.7079458236694336, "reward_std": 0.2219894826412201, "rewards/rna_reward_fn/mean": 0.7079458236694336, "rewards/rna_reward_fn/std": 0.3472329080104828, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.13749201595783234, "epoch": 2.1058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.5293802618980408, "learning_rate": 3.0196078431372546e-07, "loss": 0.0, "num_tokens": 27993824.0, "reward": 0.6385776996612549, "reward_std": 0.2456386685371399, "rewards/rna_reward_fn/mean": 0.6385776996612549, "rewards/rna_reward_fn/std": 0.36081886291503906, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 140.21875, "completions/mean_terminated_length": 140.21875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.1387496143579483, "epoch": 2.1176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.538530707359314, "learning_rate": 2.980392156862745e-07, "loss": -0.0, "num_tokens": 28138432.0, "reward": 0.6739398241043091, "reward_std": 0.21720820665359497, "rewards/rna_reward_fn/mean": 0.6739398837089539, "rewards/rna_reward_fn/std": 0.30697187781333923, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 118.65625, "completions/mean_terminated_length": 118.65625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.11488081514835358, "epoch": 2.1294117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 0.42285630106925964, "learning_rate": 2.941176470588235e-07, "loss": -0.0, "num_tokens": 28260960.0, "reward": 0.7317262887954712, "reward_std": 0.20456328988075256, "rewards/rna_reward_fn/mean": 0.7317262887954712, "rewards/rna_reward_fn/std": 0.2935360074043274, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 128.8125, "completions/mean_terminated_length": 128.8125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.13038966059684753, "epoch": 2.1411764705882352, "frac_reward_zero_std": 0.0, "grad_norm": 0.43837785720825195, "learning_rate": 2.9019607843137255e-07, "loss": 0.0, "num_tokens": 28393888.0, "reward": 0.7334122657775879, "reward_std": 0.1874283403158188, "rewards/rna_reward_fn/mean": 0.7334122657775879, "rewards/rna_reward_fn/std": 0.3205217123031616, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 142.1875, "completions/mean_terminated_length": 142.1875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.142289437353611, "epoch": 2.152941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 0.4689069092273712, "learning_rate": 2.8627450980392154e-07, "loss": -0.0, "num_tokens": 28540512.0, "reward": 0.738664448261261, "reward_std": 0.16794101893901825, "rewards/rna_reward_fn/mean": 0.7386645078659058, "rewards/rna_reward_fn/std": 0.30475351214408875, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 150.1875, "completions/mean_terminated_length": 150.1875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.13591318577528, "epoch": 2.164705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.48003292083740234, "learning_rate": 2.823529411764706e-07, "loss": -0.0, "num_tokens": 28695328.0, "reward": 0.6993162631988525, "reward_std": 0.1979941427707672, "rewards/rna_reward_fn/mean": 0.6993162035942078, "rewards/rna_reward_fn/std": 0.31292685866355896, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 173.65625, "completions/mean_terminated_length": 173.65625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.15518562495708466, "epoch": 2.176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.6343421339988708, "learning_rate": 2.784313725490196e-07, "loss": -0.0, "num_tokens": 28874176.0, "reward": 0.7311723232269287, "reward_std": 0.2127300500869751, "rewards/rna_reward_fn/mean": 0.7311723232269287, "rewards/rna_reward_fn/std": 0.3124001622200012, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 137.5625, "completions/mean_terminated_length": 137.5625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.1409146785736084, "epoch": 2.1882352941176473, "frac_reward_zero_std": 0.0, "grad_norm": 0.46661409735679626, "learning_rate": 2.7450980392156863e-07, "loss": -0.0, "num_tokens": 29016064.0, "reward": 0.7118009328842163, "reward_std": 0.16496126353740692, "rewards/rna_reward_fn/mean": 0.7118009328842163, "rewards/rna_reward_fn/std": 0.32205572724342346, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 151.21875, "completions/mean_terminated_length": 151.21875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.14989649504423141, "epoch": 2.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.44188031554222107, "learning_rate": 2.705882352941176e-07, "loss": -0.0, "num_tokens": 29171936.0, "reward": 0.7327808141708374, "reward_std": 0.17523989081382751, "rewards/rna_reward_fn/mean": 0.7327808141708374, "rewards/rna_reward_fn/std": 0.32806655764579773, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 157.84375, "completions/mean_terminated_length": 157.84375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.14322884380817413, "epoch": 2.211764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.5148700475692749, "learning_rate": 2.6666666666666667e-07, "loss": -0.0, "num_tokens": 29334592.0, "reward": 0.6917252540588379, "reward_std": 0.17680642008781433, "rewards/rna_reward_fn/mean": 0.6917252540588379, "rewards/rna_reward_fn/std": 0.30800244212150574, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.14842171967029572, "epoch": 2.223529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.5274482369422913, "learning_rate": 2.6274509803921567e-07, "loss": 0.0, "num_tokens": 29503552.0, "reward": 0.7333264350891113, "reward_std": 0.17190617322921753, "rewards/rna_reward_fn/mean": 0.7333264350891113, "rewards/rna_reward_fn/std": 0.26974406838417053, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 167.875, "completions/mean_terminated_length": 167.875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.12728291004896164, "epoch": 2.235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.4334995746612549, "learning_rate": 2.588235294117647e-07, "loss": -0.0, "num_tokens": 29676480.0, "reward": 0.6551768779754639, "reward_std": 0.18493275344371796, "rewards/rna_reward_fn/mean": 0.6551768779754639, "rewards/rna_reward_fn/std": 0.33756914734840393, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 142.59375, "completions/mean_terminated_length": 142.59375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.13632921129465103, "epoch": 2.2470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 0.5202718377113342, "learning_rate": 2.549019607843137e-07, "loss": -0.0, "num_tokens": 29823520.0, "reward": 0.779222309589386, "reward_std": 0.1619720160961151, "rewards/rna_reward_fn/mean": 0.779222309589386, "rewards/rna_reward_fn/std": 0.255502849817276, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 141.8125, "completions/mean_terminated_length": 141.8125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.1468304842710495, "epoch": 2.2588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.4959217309951782, "learning_rate": 2.5098039215686275e-07, "loss": 0.0, "num_tokens": 29969760.0, "reward": 0.6328116655349731, "reward_std": 0.20429277420043945, "rewards/rna_reward_fn/mean": 0.6328116655349731, "rewards/rna_reward_fn/std": 0.3653068244457245, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 147.03125, "completions/mean_terminated_length": 147.03125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.14507943391799927, "epoch": 2.2705882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 0.46249526739120483, "learning_rate": 2.4705882352941175e-07, "loss": -0.0, "num_tokens": 30121344.0, "reward": 0.6946768760681152, "reward_std": 0.16386722028255463, "rewards/rna_reward_fn/mean": 0.6946768760681152, "rewards/rna_reward_fn/std": 0.3166311979293823, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 119.1875, "completions/mean_terminated_length": 119.1875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.1289873719215393, "epoch": 2.2823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 0.43832215666770935, "learning_rate": 2.431372549019608e-07, "loss": -0.0, "num_tokens": 30244416.0, "reward": 0.7309268116950989, "reward_std": 0.16351744532585144, "rewards/rna_reward_fn/mean": 0.7309267520904541, "rewards/rna_reward_fn/std": 0.27468279004096985, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 132.40625, "completions/mean_terminated_length": 132.40625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.14909712970256805, "epoch": 2.2941176470588234, "frac_reward_zero_std": 0.0, "grad_norm": 0.4866437613964081, "learning_rate": 2.392156862745098e-07, "loss": -0.0, "num_tokens": 30381024.0, "reward": 0.6669021844863892, "reward_std": 0.19414769113063812, "rewards/rna_reward_fn/mean": 0.6669021844863892, "rewards/rna_reward_fn/std": 0.3391817808151245, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.14798294007778168, "epoch": 2.3058823529411763, "frac_reward_zero_std": 0.0, "grad_norm": 0.590640127658844, "learning_rate": 2.352941176470588e-07, "loss": -0.0, "num_tokens": 30560224.0, "reward": 0.6385676860809326, "reward_std": 0.20142759382724762, "rewards/rna_reward_fn/mean": 0.6385676860809326, "rewards/rna_reward_fn/std": 0.34272608160972595, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 125.125, "completions/mean_terminated_length": 125.125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.1469191089272499, "epoch": 2.317647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 0.4721366763114929, "learning_rate": 2.3137254901960785e-07, "loss": -0.0, "num_tokens": 30689376.0, "reward": 0.7269188165664673, "reward_std": 0.19917072355747223, "rewards/rna_reward_fn/mean": 0.7269188165664673, "rewards/rna_reward_fn/std": 0.3235536217689514, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 170.21875, "completions/mean_terminated_length": 170.21875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1481616050004959, "epoch": 2.3294117647058825, "frac_reward_zero_std": 0.0, "grad_norm": 0.4824952483177185, "learning_rate": 2.2745098039215685e-07, "loss": 0.0, "num_tokens": 30864704.0, "reward": 0.7315170764923096, "reward_std": 0.19473856687545776, "rewards/rna_reward_fn/mean": 0.7315171360969543, "rewards/rna_reward_fn/std": 0.31163889169692993, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 124.21875, "completions/mean_terminated_length": 124.21875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.11309440433979034, "epoch": 2.3411764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 0.43292057514190674, "learning_rate": 2.235294117647059e-07, "loss": -0.0, "num_tokens": 30992928.0, "reward": 0.6969711184501648, "reward_std": 0.18462812900543213, "rewards/rna_reward_fn/mean": 0.6969711780548096, "rewards/rna_reward_fn/std": 0.30229660868644714, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.1170443557202816, "epoch": 2.3529411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 0.42780736088752747, "learning_rate": 2.196078431372549e-07, "loss": 0.0, "num_tokens": 31112352.0, "reward": 0.7397186160087585, "reward_std": 0.16325643658638, "rewards/rna_reward_fn/mean": 0.7397185564041138, "rewards/rna_reward_fn/std": 0.2868645191192627, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 191.78125, "completions/mean_terminated_length": 191.78125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.158894345164299, "epoch": 2.364705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.5923020243644714, "learning_rate": 2.156862745098039e-07, "loss": 0.0, "num_tokens": 31309760.0, "reward": 0.713019609451294, "reward_std": 0.1600976586341858, "rewards/rna_reward_fn/mean": 0.7130196690559387, "rewards/rna_reward_fn/std": 0.3151859641075134, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 167.15625, "completions/mean_terminated_length": 167.15625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.15573827922344208, "epoch": 2.376470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.5989984273910522, "learning_rate": 2.1176470588235293e-07, "loss": -0.0, "num_tokens": 31481952.0, "reward": 0.7245238423347473, "reward_std": 0.21510586142539978, "rewards/rna_reward_fn/mean": 0.7245238423347473, "rewards/rna_reward_fn/std": 0.3133554756641388, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 147.15625, "completions/mean_terminated_length": 147.15625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.14043358713388443, "epoch": 2.388235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.45242446660995483, "learning_rate": 2.0784313725490195e-07, "loss": 0.0, "num_tokens": 31633664.0, "reward": 0.6685344576835632, "reward_std": 0.19693541526794434, "rewards/rna_reward_fn/mean": 0.6685344576835632, "rewards/rna_reward_fn/std": 0.33878231048583984, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 160.78125, "completions/mean_terminated_length": 160.78125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.14151378720998764, "epoch": 2.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.578268826007843, "learning_rate": 2.0392156862745097e-07, "loss": 0.0, "num_tokens": 31799328.0, "reward": 0.753953218460083, "reward_std": 0.14072492718696594, "rewards/rna_reward_fn/mean": 0.753953218460083, "rewards/rna_reward_fn/std": 0.323638916015625, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 116.71875, "completions/mean_terminated_length": 116.71875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.14078038185834885, "epoch": 2.411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.5669292211532593, "learning_rate": 2e-07, "loss": 0.0, "num_tokens": 31919872.0, "reward": 0.7278470993041992, "reward_std": 0.18851059675216675, "rewards/rna_reward_fn/mean": 0.7278470993041992, "rewards/rna_reward_fn/std": 0.31520187854766846, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 165.1875, "completions/mean_terminated_length": 165.1875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.1560438796877861, "epoch": 2.4235294117647057, "frac_reward_zero_std": 0.0, "grad_norm": 0.5335204005241394, "learning_rate": 1.96078431372549e-07, "loss": -0.0, "num_tokens": 32090048.0, "reward": 0.74782395362854, "reward_std": 0.16413238644599915, "rewards/rna_reward_fn/mean": 0.74782395362854, "rewards/rna_reward_fn/std": 0.27966901659965515, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 129.75, "completions/mean_terminated_length": 129.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.13756585866212845, "epoch": 2.435294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.4791547358036041, "learning_rate": 1.9215686274509803e-07, "loss": -0.0, "num_tokens": 32223936.0, "reward": 0.7443541884422302, "reward_std": 0.20347487926483154, "rewards/rna_reward_fn/mean": 0.744354248046875, "rewards/rna_reward_fn/std": 0.2934330999851227, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 144.46875, "completions/mean_terminated_length": 144.46875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.14090368151664734, "epoch": 2.447058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 0.48767152428627014, "learning_rate": 1.8823529411764705e-07, "loss": -0.0, "num_tokens": 32372896.0, "reward": 0.7094341516494751, "reward_std": 0.1646713763475418, "rewards/rna_reward_fn/mean": 0.7094341516494751, "rewards/rna_reward_fn/std": 0.31243574619293213, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.13812856376171112, "epoch": 2.458823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.43114832043647766, "learning_rate": 1.8431372549019607e-07, "loss": -0.0, "num_tokens": 32498208.0, "reward": 0.7636112570762634, "reward_std": 0.1354459822177887, "rewards/rna_reward_fn/mean": 0.7636112570762634, "rewards/rna_reward_fn/std": 0.2837965786457062, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 157.9375, "completions/mean_terminated_length": 157.9375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.12325883284211159, "epoch": 2.4705882352941178, "frac_reward_zero_std": 0.0, "grad_norm": 0.7042959928512573, "learning_rate": 1.803921568627451e-07, "loss": -0.0, "num_tokens": 32660960.0, "reward": 0.685276985168457, "reward_std": 0.14444154500961304, "rewards/rna_reward_fn/mean": 0.685276985168457, "rewards/rna_reward_fn/std": 0.3264351785182953, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 149.28125, "completions/mean_terminated_length": 149.28125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.14060577005147934, "epoch": 2.4823529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 0.7576245665550232, "learning_rate": 1.764705882352941e-07, "loss": 0.0, "num_tokens": 32814848.0, "reward": 0.7403950691223145, "reward_std": 0.19349028170108795, "rewards/rna_reward_fn/mean": 0.7403950691223145, "rewards/rna_reward_fn/std": 0.31960996985435486, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 140.09375, "completions/mean_terminated_length": 140.09375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.128474622964859, "epoch": 2.4941176470588236, "frac_reward_zero_std": 0.0, "grad_norm": 0.4394446611404419, "learning_rate": 1.7254901960784313e-07, "loss": -0.0, "num_tokens": 32959328.0, "reward": 0.7468061447143555, "reward_std": 0.13857056200504303, "rewards/rna_reward_fn/mean": 0.7468062043190002, "rewards/rna_reward_fn/std": 0.2608503997325897, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 144.03125, "completions/mean_terminated_length": 144.03125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.14114519208669662, "epoch": 2.5058823529411764, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5121099352836609, "learning_rate": 1.6862745098039215e-07, "loss": 0.0, "num_tokens": 33107840.0, "reward": 0.6896160244941711, "reward_std": 0.17474885284900665, "rewards/rna_reward_fn/mean": 0.6896160244941711, "rewards/rna_reward_fn/std": 0.30136245489120483, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.1554037183523178, "epoch": 2.5176470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 0.5231500864028931, "learning_rate": 1.6470588235294117e-07, "loss": 0.0, "num_tokens": 33310208.0, "reward": 0.7346584796905518, "reward_std": 0.20079070329666138, "rewards/rna_reward_fn/mean": 0.7346584796905518, "rewards/rna_reward_fn/std": 0.30361971259117126, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 138.9375, "completions/mean_terminated_length": 138.9375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.12060126662254333, "epoch": 2.5294117647058822, "frac_reward_zero_std": 0.03125, "grad_norm": 0.45047426223754883, "learning_rate": 1.607843137254902e-07, "loss": 0.0, "num_tokens": 33453504.0, "reward": 0.768707275390625, "reward_std": 0.13694067299365997, "rewards/rna_reward_fn/mean": 0.7687073349952698, "rewards/rna_reward_fn/std": 0.27220436930656433, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 198.78125, "completions/mean_terminated_length": 198.78125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.1575038880109787, "epoch": 2.541176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 0.5329861044883728, "learning_rate": 1.5686274509803921e-07, "loss": -0.0, "num_tokens": 33658080.0, "reward": 0.7541199922561646, "reward_std": 0.15449070930480957, "rewards/rna_reward_fn/mean": 0.7541199922561646, "rewards/rna_reward_fn/std": 0.2656092345714569, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 158.34375, "completions/mean_terminated_length": 158.34375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.15501223504543304, "epoch": 2.552941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 0.45992547273635864, "learning_rate": 1.5294117647058823e-07, "loss": 0.0, "num_tokens": 33821248.0, "reward": 0.7572486400604248, "reward_std": 0.15161246061325073, "rewards/rna_reward_fn/mean": 0.7572486400604248, "rewards/rna_reward_fn/std": 0.29167696833610535, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.13358986377716064, "epoch": 2.564705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.8965858817100525, "learning_rate": 1.4901960784313725e-07, "loss": -0.0, "num_tokens": 33995968.0, "reward": 0.7292990684509277, "reward_std": 0.16865938901901245, "rewards/rna_reward_fn/mean": 0.7292990684509277, "rewards/rna_reward_fn/std": 0.30115416646003723, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 176.78125, "completions/mean_terminated_length": 176.78125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.13434413820505142, "epoch": 2.576470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.582165002822876, "learning_rate": 1.4509803921568628e-07, "loss": -0.0, "num_tokens": 34178016.0, "reward": 0.6599196195602417, "reward_std": 0.196761354804039, "rewards/rna_reward_fn/mean": 0.6599196791648865, "rewards/rna_reward_fn/std": 0.33999550342559814, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 156.1875, "completions/mean_terminated_length": 156.1875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.1357617899775505, "epoch": 2.588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.5189464688301086, "learning_rate": 1.411764705882353e-07, "loss": 0.0, "num_tokens": 34338976.0, "reward": 0.7549696564674377, "reward_std": 0.1326015144586563, "rewards/rna_reward_fn/mean": 0.7549696564674377, "rewards/rna_reward_fn/std": 0.2852962613105774, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 148.15625, "completions/mean_terminated_length": 148.15625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.15427181124687195, "epoch": 2.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.536194920539856, "learning_rate": 1.3725490196078432e-07, "loss": 0.0, "num_tokens": 34491712.0, "reward": 0.7131255865097046, "reward_std": 0.14100758731365204, "rewards/rna_reward_fn/mean": 0.7131255865097046, "rewards/rna_reward_fn/std": 0.31784212589263916, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 145.1875, "completions/mean_terminated_length": 145.1875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.13709458708763123, "epoch": 2.611764705882353, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5712235569953918, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "num_tokens": 34641408.0, "reward": 0.7191460132598877, "reward_std": 0.16943207383155823, "rewards/rna_reward_fn/mean": 0.7191460132598877, "rewards/rna_reward_fn/std": 0.3015574514865875, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 145.1875, "completions/mean_terminated_length": 145.1875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.13566020876169205, "epoch": 2.623529411764706, "frac_reward_zero_std": 0.03125, "grad_norm": 0.4192090630531311, "learning_rate": 1.2941176470588236e-07, "loss": 0.0, "num_tokens": 34791104.0, "reward": 0.7555572986602783, "reward_std": 0.16786056756973267, "rewards/rna_reward_fn/mean": 0.7555572986602783, "rewards/rna_reward_fn/std": 0.2797638177871704, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 165.09375, "completions/mean_terminated_length": 165.09375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.12663453072309494, "epoch": 2.635294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.6057937145233154, "learning_rate": 1.2549019607843138e-07, "loss": -0.0, "num_tokens": 34961184.0, "reward": 0.6839346289634705, "reward_std": 0.19452279806137085, "rewards/rna_reward_fn/mean": 0.6839346289634705, "rewards/rna_reward_fn/std": 0.33146002888679504, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 167.65625, "completions/mean_terminated_length": 167.65625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.1426771581172943, "epoch": 2.6470588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 0.4763612747192383, "learning_rate": 1.215686274509804e-07, "loss": 0.0, "num_tokens": 35133888.0, "reward": 0.6619032621383667, "reward_std": 0.17893120646476746, "rewards/rna_reward_fn/mean": 0.6619032621383667, "rewards/rna_reward_fn/std": 0.3283209800720215, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 149.9375, "completions/mean_terminated_length": 149.9375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.14778528362512589, "epoch": 2.6588235294117646, "frac_reward_zero_std": 0.0, "grad_norm": 0.4169410169124603, "learning_rate": 1.176470588235294e-07, "loss": -0.0, "num_tokens": 35288448.0, "reward": 0.6732456088066101, "reward_std": 0.16452832520008087, "rewards/rna_reward_fn/mean": 0.6732455492019653, "rewards/rna_reward_fn/std": 0.3249601721763611, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 145.09375, "completions/mean_terminated_length": 145.09375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.1449032723903656, "epoch": 2.6705882352941175, "frac_reward_zero_std": 0.0, "grad_norm": 0.6590065360069275, "learning_rate": 1.1372549019607842e-07, "loss": -0.0, "num_tokens": 35438048.0, "reward": 0.7874460220336914, "reward_std": 0.12049897015094757, "rewards/rna_reward_fn/mean": 0.7874460220336914, "rewards/rna_reward_fn/std": 0.2661431133747101, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 151.75, "completions/mean_terminated_length": 151.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.13789667189121246, "epoch": 2.682352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 0.501124918460846, "learning_rate": 1.0980392156862744e-07, "loss": -0.0, "num_tokens": 35594464.0, "reward": 0.76551353931427, "reward_std": 0.14058314263820648, "rewards/rna_reward_fn/mean": 0.7655135989189148, "rewards/rna_reward_fn/std": 0.2855876088142395, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 163.71875, "completions/mean_terminated_length": 163.71875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.14094559848308563, "epoch": 2.6941176470588237, "frac_reward_zero_std": 0.03125, "grad_norm": 0.736441433429718, "learning_rate": 1.0588235294117647e-07, "loss": 0.0, "num_tokens": 35763136.0, "reward": 0.6939565539360046, "reward_std": 0.16584208607673645, "rewards/rna_reward_fn/mean": 0.6939565539360046, "rewards/rna_reward_fn/std": 0.32086971402168274, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 139.78125, "completions/mean_terminated_length": 139.78125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.13419293239712715, "epoch": 2.7058823529411766, "frac_reward_zero_std": 0.0, "grad_norm": 0.6264002919197083, "learning_rate": 1.0196078431372549e-07, "loss": -0.0, "num_tokens": 35907296.0, "reward": 0.7488532066345215, "reward_std": 0.1620199978351593, "rewards/rna_reward_fn/mean": 0.7488532066345215, "rewards/rna_reward_fn/std": 0.2980068624019623, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 137.40625, "completions/mean_terminated_length": 137.40625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.13055864721536636, "epoch": 2.7176470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 0.4814888536930084, "learning_rate": 9.80392156862745e-08, "loss": 0.0, "num_tokens": 36049024.0, "reward": 0.6655980348587036, "reward_std": 0.15648490190505981, "rewards/rna_reward_fn/mean": 0.6655980348587036, "rewards/rna_reward_fn/std": 0.35470837354660034, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 130.90625, "completions/mean_terminated_length": 130.90625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.12380100041627884, "epoch": 2.7294117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 0.583757221698761, "learning_rate": 9.411764705882353e-08, "loss": -0.0, "num_tokens": 36184096.0, "reward": 0.7524540424346924, "reward_std": 0.15423446893692017, "rewards/rna_reward_fn/mean": 0.7524540424346924, "rewards/rna_reward_fn/std": 0.28454405069351196, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 145.34375, "completions/mean_terminated_length": 145.34375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.1325184628367424, "epoch": 2.7411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.4390006959438324, "learning_rate": 9.019607843137255e-08, "loss": -0.0, "num_tokens": 36333952.0, "reward": 0.7277975082397461, "reward_std": 0.19573622941970825, "rewards/rna_reward_fn/mean": 0.7277975082397461, "rewards/rna_reward_fn/std": 0.32145431637763977, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 168.125, "completions/mean_terminated_length": 168.125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.13657083362340927, "epoch": 2.7529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.7681740522384644, "learning_rate": 8.627450980392157e-08, "loss": -0.0, "num_tokens": 36507136.0, "reward": 0.7168524265289307, "reward_std": 0.18613344430923462, "rewards/rna_reward_fn/mean": 0.7168524265289307, "rewards/rna_reward_fn/std": 0.3243979215621948, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.14333349466323853, "epoch": 2.764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.5657479763031006, "learning_rate": 8.235294117647059e-08, "loss": 0.0, "num_tokens": 36675968.0, "reward": 0.725771427154541, "reward_std": 0.16519448161125183, "rewards/rna_reward_fn/mean": 0.725771427154541, "rewards/rna_reward_fn/std": 0.29766252636909485, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 156.46875, "completions/mean_terminated_length": 156.46875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.1441263109445572, "epoch": 2.776470588235294, "frac_reward_zero_std": 0.03125, "grad_norm": 0.4572143256664276, "learning_rate": 7.843137254901961e-08, "loss": 0.0, "num_tokens": 36837216.0, "reward": 0.742597222328186, "reward_std": 0.16114118695259094, "rewards/rna_reward_fn/mean": 0.742597222328186, "rewards/rna_reward_fn/std": 0.29970842599868774, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 158.1875, "completions/mean_terminated_length": 158.1875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.1409977823495865, "epoch": 2.788235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.42590776085853577, "learning_rate": 7.450980392156863e-08, "loss": -0.0, "num_tokens": 37000224.0, "reward": 0.7145720720291138, "reward_std": 0.164639413356781, "rewards/rna_reward_fn/mean": 0.7145720720291138, "rewards/rna_reward_fn/std": 0.3098330497741699, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 107.21875, "completions/mean_terminated_length": 107.21875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.11754556372761726, "epoch": 2.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.4764781892299652, "learning_rate": 7.058823529411765e-08, "loss": 0.0, "num_tokens": 37111040.0, "reward": 0.7425558567047119, "reward_std": 0.16547845304012299, "rewards/rna_reward_fn/mean": 0.7425558567047119, "rewards/rna_reward_fn/std": 0.3051395118236542, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 172.84375, "completions/mean_terminated_length": 172.84375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.14019257575273514, "epoch": 2.8117647058823527, "frac_reward_zero_std": 0.0, "grad_norm": 0.5157439708709717, "learning_rate": 6.666666666666667e-08, "loss": -0.0, "num_tokens": 37289056.0, "reward": 0.6816315650939941, "reward_std": 0.2366928905248642, "rewards/rna_reward_fn/mean": 0.6816315650939941, "rewards/rna_reward_fn/std": 0.326466828584671, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 164.15625, "completions/mean_terminated_length": 164.15625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.1466379389166832, "epoch": 2.8235294117647056, "frac_reward_zero_std": 0.0, "grad_norm": 0.5139991044998169, "learning_rate": 6.274509803921569e-08, "loss": 0.0, "num_tokens": 37458176.0, "reward": 0.7532614469528198, "reward_std": 0.1603999137878418, "rewards/rna_reward_fn/mean": 0.7532614469528198, "rewards/rna_reward_fn/std": 0.31244710087776184, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 157.15625, "completions/mean_terminated_length": 157.15625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.12356984615325928, "epoch": 2.835294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.9720450043678284, "learning_rate": 5.88235294117647e-08, "loss": -0.0, "num_tokens": 37620128.0, "reward": 0.7346148490905762, "reward_std": 0.15429024398326874, "rewards/rna_reward_fn/mean": 0.7346148490905762, "rewards/rna_reward_fn/std": 0.31154975295066833, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 153.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.1341606229543686, "epoch": 2.847058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 0.5591171979904175, "learning_rate": 5.490196078431372e-08, "loss": -0.0, "num_tokens": 37778336.0, "reward": 0.7116289138793945, "reward_std": 0.21866443753242493, "rewards/rna_reward_fn/mean": 0.7116289138793945, "rewards/rna_reward_fn/std": 0.2980954051017761, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 203.4375, "completions/mean_terminated_length": 203.4375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.14845673739910126, "epoch": 2.8588235294117648, "frac_reward_zero_std": 0.0, "grad_norm": 0.5372319221496582, "learning_rate": 5.098039215686274e-08, "loss": 0.0, "num_tokens": 37987680.0, "reward": 0.7392944693565369, "reward_std": 0.19700977206230164, "rewards/rna_reward_fn/mean": 0.7392945289611816, "rewards/rna_reward_fn/std": 0.30940258502960205, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 139.6875, "completions/mean_terminated_length": 139.6875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.13047143816947937, "epoch": 2.8705882352941177, "frac_reward_zero_std": 0.0, "grad_norm": 0.5987316370010376, "learning_rate": 4.705882352941176e-08, "loss": -0.0, "num_tokens": 38131744.0, "reward": 0.6977779269218445, "reward_std": 0.2151854932308197, "rewards/rna_reward_fn/mean": 0.6977779269218445, "rewards/rna_reward_fn/std": 0.3459690511226654, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 148.40625, "completions/mean_terminated_length": 148.40625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.14810562878847122, "epoch": 2.8823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.7430775165557861, "learning_rate": 4.313725490196078e-08, "loss": -0.0, "num_tokens": 38284736.0, "reward": 0.6900802254676819, "reward_std": 0.18723735213279724, "rewards/rna_reward_fn/mean": 0.6900802254676819, "rewards/rna_reward_fn/std": 0.3328934609889984, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 139.5625, "completions/mean_terminated_length": 139.5625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.12182106822729111, "epoch": 2.8941176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 0.49635204672813416, "learning_rate": 3.9215686274509804e-08, "loss": 0.0, "num_tokens": 38428672.0, "reward": 0.7072439193725586, "reward_std": 0.1840672791004181, "rewards/rna_reward_fn/mean": 0.7072439193725586, "rewards/rna_reward_fn/std": 0.3065541088581085, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 167.6875, "completions/mean_terminated_length": 167.6875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.13815301656723022, "epoch": 2.9058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 0.8550586104393005, "learning_rate": 3.5294117647058824e-08, "loss": -0.0, "num_tokens": 38601408.0, "reward": 0.7532185316085815, "reward_std": 0.1475568264722824, "rewards/rna_reward_fn/mean": 0.7532185316085815, "rewards/rna_reward_fn/std": 0.29489991068840027, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 122.34375, "completions/mean_terminated_length": 122.34375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.12259503453969955, "epoch": 2.9176470588235293, "frac_reward_zero_std": 0.0, "grad_norm": 0.44689512252807617, "learning_rate": 3.1372549019607844e-08, "loss": 0.0, "num_tokens": 38727712.0, "reward": 0.7440149784088135, "reward_std": 0.1674138307571411, "rewards/rna_reward_fn/mean": 0.7440149188041687, "rewards/rna_reward_fn/std": 0.3040436804294586, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1282111555337906, "epoch": 2.9294117647058826, "frac_reward_zero_std": 0.0, "grad_norm": 0.5679563879966736, "learning_rate": 2.745098039215686e-08, "loss": 0.0, "num_tokens": 38925344.0, "reward": 0.6850175857543945, "reward_std": 0.19530020654201508, "rewards/rna_reward_fn/mean": 0.6850175857543945, "rewards/rna_reward_fn/std": 0.33921393752098083, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 117.09375, "completions/mean_terminated_length": 117.09375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.12855321913957596, "epoch": 2.9411764705882355, "frac_reward_zero_std": 0.0, "grad_norm": 0.505153238773346, "learning_rate": 2.352941176470588e-08, "loss": -0.0, "num_tokens": 39046272.0, "reward": 0.6269246339797974, "reward_std": 0.16829745471477509, "rewards/rna_reward_fn/mean": 0.6269246339797974, "rewards/rna_reward_fn/std": 0.33109787106513977, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 121.0, "completions/mean_terminated_length": 121.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.12059168517589569, "epoch": 2.9529411764705884, "frac_reward_zero_std": 0.0, "grad_norm": 0.4366406500339508, "learning_rate": 1.9607843137254902e-08, "loss": 0.0, "num_tokens": 39171200.0, "reward": 0.7053718566894531, "reward_std": 0.14770260453224182, "rewards/rna_reward_fn/mean": 0.7053717970848083, "rewards/rna_reward_fn/std": 0.3234374523162842, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 132.1875, "completions/mean_terminated_length": 132.1875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.13018939644098282, "epoch": 2.9647058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 0.6731492280960083, "learning_rate": 1.5686274509803922e-08, "loss": 0.0, "num_tokens": 39307584.0, "reward": 0.7679715752601624, "reward_std": 0.17536047101020813, "rewards/rna_reward_fn/mean": 0.7679715156555176, "rewards/rna_reward_fn/std": 0.2801183760166168, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 145.40625, "completions/mean_terminated_length": 145.40625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.10920717194676399, "epoch": 2.976470588235294, "frac_reward_zero_std": 0.03125, "grad_norm": 0.46245628595352173, "learning_rate": 1.176470588235294e-08, "loss": 0.0, "num_tokens": 39457504.0, "reward": 0.7559751272201538, "reward_std": 0.15144692361354828, "rewards/rna_reward_fn/mean": 0.7559751272201538, "rewards/rna_reward_fn/std": 0.3152746260166168, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 193.3125, "completions/mean_terminated_length": 193.3125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.15460387617349625, "epoch": 2.988235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.6124170422554016, "learning_rate": 7.843137254901961e-09, "loss": 0.0, "num_tokens": 39656480.0, "reward": 0.7068374752998352, "reward_std": 0.19490104913711548, "rewards/rna_reward_fn/mean": 0.7068374752998352, "rewards/rna_reward_fn/std": 0.310377836227417, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.1327020823955536, "epoch": 3.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.5195903778076172, "learning_rate": 3.9215686274509805e-09, "loss": -0.0, "num_tokens": 39810592.0, "reward": 0.7493961453437805, "reward_std": 0.17497789859771729, "rewards/rna_reward_fn/mean": 0.7493961453437805, "rewards/rna_reward_fn/std": 0.31194695830345154, "step": 255 } ], "logging_steps": 1.0, "max_steps": 255, "num_input_tokens_seen": 39810592, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 256, "trial_name": null, "trial_params": null }