{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9935368536853685, "eval_steps": 500, "global_step": 14450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.875, "completions/mean_terminated_length": 3.875, "completions/min_length": 3.7, "completions/min_terminated_length": 3.7, "epoch": 0.0007638252367858234, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 5.584762921273523e-06, "learning_rate": 3.4351145038167944e-08, "loss": 0.0, "num_tokens": 14515.0, "reward": 0.40999999046325686, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 0.12999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 0.2, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.0800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10 }, { "completion_length": 6.7, "completions/clipped_ratio": 0.0, "completions/max_length": 6.7, "completions/max_terminated_length": 6.7, "completions/mean_length": 5.575, "completions/mean_terminated_length": 5.575, "completions/min_length": 4.3, "completions/min_terminated_length": 4.3, "epoch": 0.0015276504735716467, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1.2543402516485002e-05, "learning_rate": 7.251908396946566e-08, "loss": 0.0, "num_tokens": 26178.0, "reward": 1.024999976158142, "reward_std": 0.23671360015869142, "rewards/coherence_reward_func/mean": 0.32499998807907104, "rewards/coherence_reward_func/std": 0.07505553364753723, "rewards/formatting_reward_func/mean": 0.5, "rewards/formatting_reward_func/std": 0.1154700517654419, "rewards/quality_reward_func/mean": 0.20000000298023224, "rewards/quality_reward_func/std": 0.046188023686408994, "step": 20 }, { "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 1.5, "completions/mean_terminated_length": 1.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.00229147571035747, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1.8039132919511756e-05, "learning_rate": 1.1068702290076337e-07, "loss": 0.0, "num_tokens": 40390.0, "reward": 0.10249999761581421, "reward_std": 0.20499999523162843, "rewards/coherence_reward_func/mean": 0.032499998807907104, "rewards/coherence_reward_func/std": 0.06499999761581421, "rewards/formatting_reward_func/mean": 0.05, "rewards/formatting_reward_func/std": 0.1, "rewards/quality_reward_func/mean": 0.020000000298023225, "rewards/quality_reward_func/std": 0.04000000059604645, "step": 30 }, { "completion_length": 6.9, "completions/clipped_ratio": 0.0, "completions/max_length": 6.9, "completions/max_terminated_length": 6.9, "completions/mean_length": 6.3, "completions/mean_terminated_length": 6.3, "completions/min_length": 5.6, "completions/min_terminated_length": 5.6, "epoch": 0.0030553009471432934, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 4.868981123611338e-05, "learning_rate": 1.4885496183206107e-07, "loss": 0.0, "num_tokens": 54058.0, "reward": 0.9599999785423279, "reward_std": 0.16165807247161865, "rewards/coherence_reward_func/mean": 0.32499998807907104, "rewards/coherence_reward_func/std": 0.07505553364753723, "rewards/formatting_reward_func/mean": 0.475, "rewards/formatting_reward_func/std": 0.08660253882408142, "rewards/quality_reward_func/mean": 0.1600000023841858, "rewards/quality_reward_func/std": 0.0, "step": 40 }, { "completion_length": 2.2, "completions/clipped_ratio": 0.0, "completions/max_length": 2.2, "completions/max_terminated_length": 2.2, "completions/mean_length": 1.3, "completions/mean_terminated_length": 1.3, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.003819126183929117, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 4.7134438500506806e-05, "learning_rate": 1.870229007633588e-07, "loss": 0.0, "num_tokens": 67990.0, "reward": 0.10249999761581421, "reward_std": 0.20499999523162843, "rewards/coherence_reward_func/mean": 0.032499998807907104, "rewards/coherence_reward_func/std": 0.06499999761581421, "rewards/formatting_reward_func/mean": 0.05, "rewards/formatting_reward_func/std": 0.1, "rewards/quality_reward_func/mean": 0.020000000298023225, "rewards/quality_reward_func/std": 0.04000000059604645, "step": 50 }, { "completion_length": 1.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1.0, "completions/max_terminated_length": 1.0, "completions/mean_length": 1.0, "completions/mean_terminated_length": 1.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.004125412541254125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.6421079635620118e-06, "learning_rate": 1.9666666666666668e-06, "loss": 0.0, "num_tokens": 79438.0, "reward": 0.0, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 0.0, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 0.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.0, "rewards/quality_reward_func/std": 0.0, "step": 60 }, { "completion_length": 1.6, "completions/clipped_ratio": 0.0, "completions/max_length": 1.6, "completions/max_terminated_length": 1.6, "completions/mean_length": 1.6, "completions/mean_terminated_length": 1.6, "completions/min_length": 1.6, "completions/min_terminated_length": 1.6, "epoch": 0.004812981298129813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.8922347294392238e-06, "learning_rate": 2.3000000000000004e-06, "loss": 0.0, "num_tokens": 92538.0, "reward": 0.1100000023841858, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 0.06000000238418579, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 0.05, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.0, "rewards/quality_reward_func/std": 0.0, "step": 70 }, { "completion_length": 1.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1.0, "completions/max_terminated_length": 1.0, "completions/mean_length": 1.0, "completions/mean_terminated_length": 1.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.005500550055005501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.56402587890625e-05, "learning_rate": 2.6333333333333332e-06, "loss": 0.0, "num_tokens": 106926.0, "reward": 0.0, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 0.0, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 0.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.0, "rewards/quality_reward_func/std": 0.0, "step": 80 }, { "completion_length": 1.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1.0, "completions/max_terminated_length": 1.0, "completions/mean_length": 1.0, "completions/mean_terminated_length": 1.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.006188118811881188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 2.9624998569488524e-05, "learning_rate": 2.9666666666666673e-06, "loss": 0.0, "num_tokens": 120118.0, "reward": 0.0, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 0.0, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 0.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.0, "rewards/quality_reward_func/std": 0.0, "step": 90 }, { "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 1.25, "completions/mean_terminated_length": 1.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.006875687568756876, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.0018734597397269681, "learning_rate": 3.3000000000000006e-06, "loss": 0.0, "num_tokens": 134128.0, "reward": 0.0899999976158142, "reward_std": 0.1799999952316284, "rewards/coherence_reward_func/mean": 0.032499998807907104, "rewards/coherence_reward_func/std": 0.06499999761581421, "rewards/formatting_reward_func/mean": 0.05, "rewards/formatting_reward_func/std": 0.1, "rewards/quality_reward_func/mean": 0.007500000298023224, "rewards/quality_reward_func/std": 0.015000002086162567, "step": 100 }, { "completion_length": 5.9, "completions/clipped_ratio": 0.0, "completions/max_length": 5.9, "completions/max_terminated_length": 5.9, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.007563256325632563, "frac_reward_zero_std": 0.7, "grad_norm": 0.5969054698944092, "kl": 0.3142867418937385, "learning_rate": 3.633333333333334e-06, "loss": 0.0, "num_tokens": 149936.0, "reward": 0.6974999606609344, "reward_std": 0.574999988079071, "rewards/coherence_reward_func/mean": 0.21999999284744262, "rewards/coherence_reward_func/std": 0.1799999952316284, "rewards/formatting_reward_func/mean": 0.3375, "rewards/formatting_reward_func/std": 0.275, "rewards/quality_reward_func/mean": 0.14000000506639482, "rewards/quality_reward_func/std": 0.12000000178813934, "step": 110 }, { "completion_length": 10.3, "completions/clipped_ratio": 0.0, "completions/max_length": 10.3, "completions/max_terminated_length": 10.3, "completions/mean_length": 7.55, "completions/mean_terminated_length": 7.55, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.00825082508250825, "frac_reward_zero_std": 0.7, "grad_norm": 0.0026277885772287846, "kl": 470.12245586041826, "learning_rate": 3.966666666666667e-06, "loss": 0.0119, "num_tokens": 164358.0, "reward": 1.6574999570846558, "reward_std": 0.27903410643339155, "rewards/coherence_reward_func/mean": 0.5399999856948853, "rewards/coherence_reward_func/std": 0.09237603992223739, "rewards/formatting_reward_func/mean": 0.8375, "rewards/formatting_reward_func/std": 0.1404700517654419, "rewards/quality_reward_func/mean": 0.2800000041723251, "rewards/quality_reward_func/std": 0.046188023686408994, "step": 120 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 15.8, "completions/mean_terminated_length": 15.8, "completions/min_length": 11.9, "completions/min_terminated_length": 11.9, "epoch": 0.008938393839383938, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "kl": 0.46882193982601167, "learning_rate": 4.3e-06, "loss": 0.0, "num_tokens": 178634.0, "reward": 3.5749999046325684, "reward_std": 0.22999999523162842, "rewards/coherence_reward_func/mean": 1.1374999582767487, "rewards/coherence_reward_func/std": 0.06499999761581421, "rewards/formatting_reward_func/mean": 1.75, "rewards/formatting_reward_func/std": 0.1, "rewards/quality_reward_func/mean": 0.6875000119209289, "rewards/quality_reward_func/std": 0.06500000059604645, "step": 130 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 11.8, "completions/min_terminated_length": 11.8, "epoch": 0.009625962596259627, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "kl": 0.4925146855413914, "learning_rate": 4.633333333333334e-06, "loss": 0.0, "num_tokens": 192926.0, "reward": 3.2499999046325683, "reward_std": 0.44126754999160767, "rewards/coherence_reward_func/mean": 1.0424999654293061, "rewards/coherence_reward_func/std": 0.14356523752212524, "rewards/formatting_reward_func/mean": 1.6125, "rewards/formatting_reward_func/std": 0.21160253882408142, "rewards/quality_reward_func/mean": 0.595000010728836, "rewards/quality_reward_func/std": 0.10350853204727173, "step": 140 }, { "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.825, "completions/mean_terminated_length": 17.825, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.010313531353135313, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1.0868607074022294, "learning_rate": 4.966666666666667e-06, "loss": 0.0, "num_tokens": 206363.0, "reward": 4.059999895095825, "reward_std": 0.07999999523162842, "rewards/coherence_reward_func/mean": 1.2924999475479126, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 1.9875, "rewards/formatting_reward_func/std": 0.025, "rewards/quality_reward_func/mean": 0.7800000131130218, "rewards/quality_reward_func/std": 0.04000000059604645, "step": 150 }, { "completion_length": 31.1, "completions/clipped_ratio": 0.0, "completions/max_length": 31.1, "completions/max_terminated_length": 31.1, "completions/mean_length": 20.3, "completions/mean_terminated_length": 20.3, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.011001100110011002, "frac_reward_zero_std": 0.9, "grad_norm": 0.0007889735861681402, "kl": 0.7975522613618523, "learning_rate": 4.999451708687114e-06, "loss": 0.0, "num_tokens": 219259.0, "reward": 4.079999899864196, "reward_std": 0.04000000059604645, "rewards/coherence_reward_func/mean": 1.2924999475479126, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 1.9875, "rewards/formatting_reward_func/std": 0.025, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 160 }, { "completion_length": 34.6, "completions/clipped_ratio": 0.0, "completions/max_length": 34.6, "completions/max_terminated_length": 34.6, "completions/mean_length": 23.1, "completions/mean_terminated_length": 23.1, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.011688668866886688, "frac_reward_zero_std": 0.8, "grad_norm": 0.419414222240448, "kl": 0.7056910984218121, "learning_rate": 4.9975566894538954e-06, "loss": 0.0, "num_tokens": 235091.0, "reward": 4.077499866485596, "reward_std": 0.03232050836086273, "rewards/coherence_reward_func/mean": 1.2774999499320985, "rewards/coherence_reward_func/std": 0.03232050389051437, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 170 }, { "completion_length": 22.9, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 18.05, "completions/mean_terminated_length": 18.05, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.012376237623762377, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1124.961197933089, "learning_rate": 4.994309199213748e-06, "loss": 0.0461, "num_tokens": 247253.0, "reward": 4.074999904632568, "reward_std": 0.028867512941360474, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.7750000119209289, "rewards/quality_reward_func/std": 0.028867512941360474, "step": 180 }, { "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 20.05, "completions/mean_terminated_length": 20.05, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.013063806380638063, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.693901395983994, "learning_rate": 4.989710996539926e-06, "loss": 0.0, "num_tokens": 261675.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 190 }, { "completion_length": 25.2, "completions/clipped_ratio": 0.0, "completions/max_length": 25.2, "completions/max_terminated_length": 25.2, "completions/mean_length": 21.575, "completions/mean_terminated_length": 21.575, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.013751375137513752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7635734604671598, "learning_rate": 4.983764571440296e-06, "loss": 0.0, "num_tokens": 277602.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 200 }, { "completion_length": 24.8, "completions/clipped_ratio": 0.0, "completions/max_length": 24.8, "completions/max_terminated_length": 24.8, "completions/mean_length": 21.266666666666666, "completions/mean_terminated_length": 21.266666666666666, "completions/min_length": 17.666666666666668, "completions/min_terminated_length": 17.666666666666668, "epoch": 0.014438943894389438, "frac_reward_zero_std": 0.9333333333333333, "grad_norm": 0.0, "kl": 0.8949815820577594, "learning_rate": 4.9764731440089494e-06, "loss": 0.0, "num_tokens": 290559.0, "reward": 4.001666577657064, "reward_std": 0.030000003178914388, "rewards/coherence_reward_func/mean": 1.2599999586741129, "rewards/coherence_reward_func/std": 0.013333333532015483, "rewards/formatting_reward_func/mean": 1.975, "rewards/formatting_reward_func/std": 0.016666666666666666, "rewards/quality_reward_func/mean": 0.7666666785875956, "rewards/quality_reward_func/std": 0.0, "step": 210 }, { "completion_length": 20.9, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 18.7, "completions/mean_terminated_length": 18.7, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.015126512651265127, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.8888865269720554, "learning_rate": 4.96784066268247e-06, "loss": 0.0, "num_tokens": 305655.0, "reward": 4.087499904632568, "reward_std": 0.025, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.787500011920929, "rewards/quality_reward_func/std": 0.025, "step": 220 }, { "completion_length": 22.8, "completions/clipped_ratio": 0.0, "completions/max_length": 22.8, "completions/max_terminated_length": 22.8, "completions/mean_length": 18.1, "completions/mean_terminated_length": 18.1, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.015814081408140813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6253720429260283, "learning_rate": 4.957871802101782e-06, "loss": 0.0, "num_tokens": 319763.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 230 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.0165016501650165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.773795984685421, "learning_rate": 4.9465719605807505e-06, "loss": 0.0, "num_tokens": 332104.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 240 }, { "completion_length": 22.4, "completions/clipped_ratio": 0.0, "completions/max_length": 22.4, "completions/max_terminated_length": 22.4, "completions/mean_length": 18.65, "completions/mean_terminated_length": 18.65, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.01718921892189219, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7744052636437118, "learning_rate": 4.933947257182901e-06, "loss": 0.0, "num_tokens": 347002.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 250 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.017876787678767877, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.8353285151533782, "learning_rate": 4.920004528407837e-06, "loss": 0.0, "num_tokens": 360267.0, "reward": 4.087499904632568, "reward_std": 0.025, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.787500011920929, "rewards/quality_reward_func/std": 0.025, "step": 260 }, { "completion_length": 28.2, "completions/clipped_ratio": 0.0, "completions/max_length": 28.2, "completions/max_terminated_length": 28.2, "completions/mean_length": 23.05, "completions/mean_terminated_length": 23.05, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.018564356435643563, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "kl": 0.7147501368832309, "learning_rate": 4.904751324489156e-06, "loss": 0.0, "num_tokens": 374665.0, "reward": 4.0024998664855955, "reward_std": 0.07500000447034835, "rewards/coherence_reward_func/mean": 1.2399999499320984, "rewards/coherence_reward_func/std": 0.04999999403953552, "rewards/formatting_reward_func/mean": 1.9625, "rewards/formatting_reward_func/std": 0.025, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 270 }, { "completion_length": 22.4, "completions/clipped_ratio": 0.0, "completions/max_length": 22.4, "completions/max_terminated_length": 22.4, "completions/mean_length": 19.775, "completions/mean_terminated_length": 19.775, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.019251925192519254, "frac_reward_zero_std": 0.9, "grad_norm": 0.09396308660507202, "kl": 0.710376477369573, "learning_rate": 4.888195905305859e-06, "loss": 0.0, "num_tokens": 389896.0, "reward": 4.087499904632568, "reward_std": 0.025, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.787500011920929, "rewards/quality_reward_func/std": 0.025, "step": 280 }, { "completion_length": 22.3, "completions/clipped_ratio": 0.0, "completions/max_length": 22.3, "completions/max_terminated_length": 22.3, "completions/mean_length": 17.925, "completions/mean_terminated_length": 17.925, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.01993949394939494, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.5978727843612432, "learning_rate": 4.870347235909494e-06, "loss": 0.0, "num_tokens": 403949.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 290 }, { "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.5, "completions/max_terminated_length": 30.5, "completions/mean_length": 22.425, "completions/mean_terminated_length": 22.425, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.020627062706270627, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.7130264882929623, "learning_rate": 4.851214981669406e-06, "loss": 0.0, "num_tokens": 418738.0, "reward": 4.092499876022339, "reward_std": 0.015000002086162567, "rewards/coherence_reward_func/mean": 1.2924999475479126, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 300 }, { "completion_length": 24.3, "completions/clipped_ratio": 0.0, "completions/max_length": 24.3, "completions/max_terminated_length": 24.3, "completions/mean_length": 20.675, "completions/mean_terminated_length": 20.675, "completions/min_length": 18.2, "completions/min_terminated_length": 18.2, "epoch": 0.021314631463146314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6356086356565356, "learning_rate": 4.830809503038781e-06, "loss": 0.0, "num_tokens": 434841.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 310 }, { "completion_length": 24.2, "completions/clipped_ratio": 0.0, "completions/max_length": 24.2, "completions/max_terminated_length": 24.2, "completions/mean_length": 19.875, "completions/mean_terminated_length": 19.875, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.022002200220022004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8585543476045132, "learning_rate": 4.809141849944271e-06, "loss": 0.0, "num_tokens": 450032.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 320 }, { "completion_length": 24.1, "completions/clipped_ratio": 0.0, "completions/max_length": 24.1, "completions/max_terminated_length": 24.1, "completions/mean_length": 20.35, "completions/mean_terminated_length": 20.35, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.02268976897689769, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7699921066872776, "learning_rate": 4.786223755802268e-06, "loss": 0.0, "num_tokens": 464362.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 330 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.023377337733773377, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7013043455779553, "learning_rate": 4.762067631165049e-06, "loss": 0.0, "num_tokens": 478792.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 340 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.024064906490649064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7579462625086307, "learning_rate": 4.736686557000247e-06, "loss": 0.0, "num_tokens": 492155.0, "reward": 3.6899999141693116, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.1699999570846558, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 1.8, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.720000010728836, "rewards/quality_reward_func/std": 0.0, "step": 350 }, { "completion_length": 36.4, "completions/clipped_ratio": 0.0, "completions/max_length": 36.4, "completions/max_terminated_length": 36.4, "completions/mean_length": 27.95, "completions/mean_terminated_length": 27.95, "completions/min_length": 19.3, "completions/min_terminated_length": 19.3, "epoch": 0.024752475247524754, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.523247461207211, "learning_rate": 4.710094277607269e-06, "loss": 0.0, "num_tokens": 508017.0, "reward": 4.077499914169311, "reward_std": 0.04499998986721039, "rewards/coherence_reward_func/mean": 1.2899999499320984, "rewards/coherence_reward_func/std": 0.020000000298023225, "rewards/formatting_reward_func/mean": 1.9875, "rewards/formatting_reward_func/std": 0.025, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 360 }, { "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.825, "completions/mean_terminated_length": 17.825, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.02544004400440044, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.8155595321208239, "learning_rate": 4.682305193174524e-06, "loss": 0.0, "num_tokens": 521294.0, "reward": 4.087499904632568, "reward_std": 0.025, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.787500011920929, "rewards/quality_reward_func/std": 0.025, "step": 370 }, { "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 20.825, "completions/mean_terminated_length": 20.825, "completions/min_length": 18.1, "completions/min_terminated_length": 18.1, "epoch": 0.026127612761276127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7052144195884467, "learning_rate": 4.653334351981464e-06, "loss": 0.0, "num_tokens": 536163.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 380 }, { "completion_length": 23.2, "completions/clipped_ratio": 0.0, "completions/max_length": 23.2, "completions/max_terminated_length": 23.2, "completions/mean_length": 19.45, "completions/mean_terminated_length": 19.45, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.026815181518151814, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.798173398245126, "learning_rate": 4.623197442249667e-06, "loss": 0.0, "num_tokens": 552493.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 390 }, { "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.675, "completions/mean_terminated_length": 21.675, "completions/min_length": 18.2, "completions/min_terminated_length": 18.2, "epoch": 0.027502750275027504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6294201251119376, "learning_rate": 4.591910783647405e-06, "loss": 0.0, "num_tokens": 567780.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 400 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 18.425, "completions/mean_terminated_length": 18.425, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.02819031903190319, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.8350168326869607, "learning_rate": 4.559491318452238e-06, "loss": 0.0, "num_tokens": 579861.0, "reward": 4.06999990940094, "reward_std": 0.059999996423721315, "rewards/coherence_reward_func/mean": 1.2824999570846558, "rewards/coherence_reward_func/std": 0.034999996423721313, "rewards/formatting_reward_func/mean": 1.9875, "rewards/formatting_reward_func/std": 0.025, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 410 }, { "completion_length": 25.1, "completions/clipped_ratio": 0.0, "completions/max_length": 25.1, "completions/max_terminated_length": 25.1, "completions/mean_length": 20.95, "completions/mean_terminated_length": 20.95, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.028877887788778877, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6226312205195427, "learning_rate": 4.525956602376486e-06, "loss": 0.0, "num_tokens": 593839.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 420 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 19.275, "completions/mean_terminated_length": 19.275, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.029565456545654567, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.6491609741002321, "learning_rate": 4.491324795060491e-06, "loss": 0.0, "num_tokens": 609682.0, "reward": 4.077499914169311, "reward_std": 0.01499999314546585, "rewards/coherence_reward_func/mean": 1.2774999618530274, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 430 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.030253025302530254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.5745274079963565, "learning_rate": 4.455614650238858e-06, "loss": 0.0, "num_tokens": 623798.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 440 }, { "completion_length": 24.1, "completions/clipped_ratio": 0.0, "completions/max_length": 24.1, "completions/max_terminated_length": 24.1, "completions/mean_length": 19.7, "completions/mean_terminated_length": 19.7, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.03094059405940594, "frac_reward_zero_std": 1.0, "grad_norm": 0.00047155821812339127, "kl": 0.6368549924343825, "learning_rate": 4.418845505584972e-06, "loss": 0.0, "num_tokens": 637042.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 450 }, { "completion_length": 22.9, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.03162816281628163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7010985561646521, "learning_rate": 4.381037272239311e-06, "loss": 0.0, "num_tokens": 652746.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 460 }, { "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 18.65, "completions/mean_terminated_length": 18.65, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.032315731573157314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8845027022063732, "learning_rate": 4.34221042402721e-06, "loss": 0.0, "num_tokens": 666280.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 470 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 18.95, "completions/mean_terminated_length": 18.95, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.033003300330033, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0147626213729382, "learning_rate": 4.302385986371924e-06, "loss": 0.0, "num_tokens": 680162.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 480 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.033690869086908694, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1609789356589317, "learning_rate": 4.261585524908987e-06, "loss": 0.0, "num_tokens": 694665.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 490 }, { "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 20.45, "completions/mean_terminated_length": 20.45, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.03437843784378438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.47180836610496046, "learning_rate": 4.2198311338080466e-06, "loss": 0.0, "num_tokens": 711203.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 500 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 13.9, "completions/min_terminated_length": 13.9, "epoch": 0.03506600660066007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7761064074933529, "learning_rate": 4.177145423808477e-06, "loss": 0.0, "num_tokens": 723994.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 510 }, { "completion_length": 27.9, "completions/clipped_ratio": 0.0, "completions/max_length": 27.9, "completions/max_terminated_length": 27.9, "completions/mean_length": 21.05, "completions/mean_terminated_length": 21.05, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.035753575357535754, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.9148745775222779, "learning_rate": 4.133551509975264e-06, "loss": 0.0, "num_tokens": 738352.0, "reward": 4.092499876022339, "reward_std": 0.015000002086162567, "rewards/coherence_reward_func/mean": 1.2924999475479126, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 520 }, { "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 18.05, "completions/mean_terminated_length": 18.05, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.03644114411441144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8287635251879693, "learning_rate": 4.089072999181792e-06, "loss": 0.0, "num_tokens": 755102.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 530 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.03712871287128713, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.6910347867757082, "learning_rate": 4.043733977326304e-06, "loss": 0.0, "num_tokens": 768025.0, "reward": 3.8949999094009398, "reward_std": 0.23671360015869142, "rewards/coherence_reward_func/mean": 1.23499995470047, "rewards/coherence_reward_func/std": 0.07505553364753723, "rewards/formatting_reward_func/mean": 1.9, "rewards/formatting_reward_func/std": 0.1154700517654419, "rewards/quality_reward_func/mean": 0.7600000113248825, "rewards/quality_reward_func/std": 0.046188023686408994, "step": 540 }, { "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.85, "completions/mean_terminated_length": 19.85, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.037816281628162814, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6972396697849035, "learning_rate": 3.997558996288965e-06, "loss": 0.0, "num_tokens": 782131.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 550 }, { "completion_length": 23.7, "completions/clipped_ratio": 0.0, "completions/max_length": 23.7, "completions/max_terminated_length": 23.7, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.03850385038503851, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6881673350930214, "learning_rate": 3.9505730606365826e-06, "loss": 0.0, "num_tokens": 797371.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 560 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 18.05, "completions/mean_terminated_length": 18.05, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.039191419141914194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6732351541519165, "learning_rate": 3.902801614082195e-06, "loss": 0.0, "num_tokens": 812681.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 570 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 16.35, "completions/mean_terminated_length": 16.35, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.03987898789878988, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8790098778903485, "learning_rate": 3.85427052570685e-06, "loss": 0.0, "num_tokens": 828143.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 580 }, { "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 20.025, "completions/mean_terminated_length": 20.025, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.04056655665566557, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.6692779961973429, "learning_rate": 3.8050060759510453e-06, "loss": 0.0, "num_tokens": 843544.0, "reward": 4.092499876022339, "reward_std": 0.015000002086162567, "rewards/coherence_reward_func/mean": 1.2924999475479126, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 590 }, { "completion_length": 22.9, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 20.725, "completions/mean_terminated_length": 20.725, "completions/min_length": 18.4, "completions/min_terminated_length": 18.4, "epoch": 0.041254125412541254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.842956809559837, "learning_rate": 3.755034942383401e-06, "loss": 0.0, "num_tokens": 858861.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 600 }, { "completion_length": 21.1, "completions/clipped_ratio": 0.0, "completions/max_length": 21.1, "completions/max_terminated_length": 21.1, "completions/mean_length": 17.675, "completions/mean_terminated_length": 17.675, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.04194169416941694, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.741293515264988, "learning_rate": 3.7043841852542884e-06, "loss": 0.0, "num_tokens": 872064.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 610 }, { "completion_length": 21.9, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 19.125, "completions/mean_terminated_length": 19.125, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.04262926292629263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8384895706549287, "learning_rate": 3.6530812328422272e-06, "loss": 0.0, "num_tokens": 886225.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 620 }, { "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 19.625, "completions/mean_terminated_length": 19.625, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.043316831683168314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7357438832521439, "learning_rate": 3.6011538666009877e-06, "loss": 0.0, "num_tokens": 902306.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 630 }, { "completion_length": 22.9, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 19.075, "completions/mean_terminated_length": 19.075, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.04400440044004401, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8144454814493656, "learning_rate": 3.5486302061154433e-06, "loss": 0.0, "num_tokens": 915053.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 640 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.525, "completions/mean_terminated_length": 15.525, "completions/min_length": 13.4, "completions/min_terminated_length": 13.4, "epoch": 0.044691969196919694, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.821483525633812, "learning_rate": 3.4955386938743217e-06, "loss": 0.0, "num_tokens": 928142.0, "reward": 4.0499999046325685, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.7500000119209289, "rewards/quality_reward_func/std": 0.0, "step": 650 }, { "completion_length": 21.1, "completions/clipped_ratio": 0.0, "completions/max_length": 21.1, "completions/max_terminated_length": 21.1, "completions/mean_length": 17.85, "completions/mean_terminated_length": 17.85, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.04537953795379538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8167589947581291, "learning_rate": 3.4419080798680934e-06, "loss": 0.0, "num_tokens": 943068.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 660 }, { "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.825, "completions/mean_terminated_length": 21.825, "completions/min_length": 18.7, "completions/min_terminated_length": 18.7, "epoch": 0.04606710671067107, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7352058874210343, "learning_rate": 3.387767406020343e-06, "loss": 0.0, "num_tokens": 957793.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 670 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 18.025, "completions/mean_terminated_length": 18.025, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.046754675467546754, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9711360923945904, "learning_rate": 3.333145990461061e-06, "loss": 0.0, "num_tokens": 973398.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 680 }, { "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 19.35, "completions/mean_terminated_length": 19.35, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.04744224422442244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6931308001279831, "learning_rate": 3.2780734116503504e-06, "loss": 0.0, "num_tokens": 986624.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 690 }, { "completion_length": 22.2, "completions/clipped_ratio": 0.0, "completions/max_length": 22.2, "completions/max_terminated_length": 22.2, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.04812981298129813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.913802171498537, "learning_rate": 3.222579492361179e-06, "loss": 0.0, "num_tokens": 999188.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 700 }, { "completion_length": 15.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.4, "completions/min_terminated_length": 13.4, "epoch": 0.04881738173817382, "frac_reward_zero_std": 1.0, "grad_norm": 8.417941717198119e-05, "kl": 0.9791606456041336, "learning_rate": 3.1666942835298143e-06, "loss": 0.0, "num_tokens": 1013474.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 710 }, { "completion_length": 26.8, "completions/clipped_ratio": 0.0, "completions/max_length": 26.8, "completions/max_terminated_length": 26.8, "completions/mean_length": 21.575, "completions/mean_terminated_length": 21.575, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.04950495049504951, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.561687457934022, "learning_rate": 3.110448047982714e-06, "loss": 0.0, "num_tokens": 1027477.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 720 }, { "completion_length": 25.9, "completions/clipped_ratio": 0.0, "completions/max_length": 25.9, "completions/max_terminated_length": 25.9, "completions/mean_length": 21.275, "completions/mean_terminated_length": 21.275, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.050192519251925194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7257001161575317, "learning_rate": 3.053871244048669e-06, "loss": 0.0, "num_tokens": 1041216.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 730 }, { "completion_length": 23.3, "completions/clipped_ratio": 0.0, "completions/max_length": 23.3, "completions/max_terminated_length": 23.3, "completions/mean_length": 19.725, "completions/mean_terminated_length": 19.725, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.05088008800880088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9793748699128628, "learning_rate": 2.9969945090650866e-06, "loss": 0.0, "num_tokens": 1055529.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 740 }, { "completion_length": 23.2, "completions/clipped_ratio": 0.0, "completions/max_length": 23.2, "completions/max_terminated_length": 23.2, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.05156765676567657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6666584078222513, "learning_rate": 2.9398486427873276e-06, "loss": 0.0, "num_tokens": 1070861.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 750 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.052255225522552254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0366931222379208, "learning_rate": 2.8824645907100957e-06, "loss": 0.0, "num_tokens": 1087072.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 760 }, { "completion_length": 26.8, "completions/clipped_ratio": 0.0, "completions/max_length": 26.8, "completions/max_terminated_length": 26.8, "completions/mean_length": 19.2, "completions/mean_terminated_length": 19.2, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.05294279427942794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8708533995551988, "learning_rate": 2.824873427309907e-06, "loss": 0.0, "num_tokens": 1102088.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 770 }, { "completion_length": 23.1, "completions/clipped_ratio": 0.0, "completions/max_length": 23.1, "completions/max_terminated_length": 23.1, "completions/mean_length": 18.425, "completions/mean_terminated_length": 18.425, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.05363036303630363, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7705225108191371, "learning_rate": 2.7671063392177133e-06, "loss": 0.0, "num_tokens": 1116397.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 780 }, { "completion_length": 24.8, "completions/clipped_ratio": 0.0, "completions/max_length": 24.8, "completions/max_terminated_length": 24.8, "completions/mean_length": 20.55, "completions/mean_terminated_length": 20.55, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.05431793179317932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6953186644241214, "learning_rate": 2.70919460833079e-06, "loss": 0.0, "num_tokens": 1134011.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 790 }, { "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.05500550055005501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.5938798669725657, "learning_rate": 2.6511695948730357e-06, "loss": 0.0, "num_tokens": 1149273.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 800 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 18.075, "completions/mean_terminated_length": 18.075, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.055693069306930694, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0365911795757712, "learning_rate": 2.593062720412865e-06, "loss": 0.0, "num_tokens": 1162224.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 810 }, { "completion_length": 24.2, "completions/clipped_ratio": 0.0, "completions/max_length": 24.2, "completions/max_terminated_length": 24.2, "completions/mean_length": 20.7, "completions/mean_terminated_length": 20.7, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.05638063806380638, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6314735591411591, "learning_rate": 2.5349054508478636e-06, "loss": 0.0, "num_tokens": 1178732.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 820 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.7, "completions/min_terminated_length": 13.7, "epoch": 0.05706820682068207, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8896382354199887, "learning_rate": 2.4767292793654587e-06, "loss": 0.0, "num_tokens": 1190762.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 830 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.057755775577557754, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0564545184373855, "learning_rate": 2.4185657093887975e-06, "loss": 0.0, "num_tokens": 1205791.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 840 }, { "completion_length": 22.8, "completions/clipped_ratio": 0.0, "completions/max_length": 22.8, "completions/max_terminated_length": 22.8, "completions/mean_length": 19.375, "completions/mean_terminated_length": 19.375, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.05844334433443344, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1.0678101744502784, "learning_rate": 2.3604462375170905e-06, "loss": 0.0, "num_tokens": 1220030.0, "reward": 4.024999904632568, "reward_std": 0.15, "rewards/coherence_reward_func/mean": 1.2824999570846558, "rewards/coherence_reward_func/std": 0.034999996423721313, "rewards/formatting_reward_func/mean": 1.9625, "rewards/formatting_reward_func/std": 0.075, "rewards/quality_reward_func/mean": 0.7800000131130218, "rewards/quality_reward_func/std": 0.04000000059604645, "step": 850 }, { "completion_length": 21.9, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 18.625, "completions/mean_terminated_length": 18.625, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.059130913091309134, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8187783338828012, "learning_rate": 2.3024023364696473e-06, "loss": 0.0, "num_tokens": 1234427.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 860 }, { "completion_length": 22.3, "completions/clipped_ratio": 0.0, "completions/max_length": 22.3, "completions/max_terminated_length": 22.3, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.05981848184818482, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.962415424734354, "learning_rate": 2.2444654380428413e-06, "loss": 0.0, "num_tokens": 1246113.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 870 }, { "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 19.1, "completions/mean_terminated_length": 19.1, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.06050605060506051, "frac_reward_zero_std": 0.9, "grad_norm": 0.7348126769065857, "kl": 3.9835946768522263, "learning_rate": 2.186666916089239e-06, "loss": 0.0002, "num_tokens": 1262277.0, "reward": 4.087499904632568, "reward_std": 0.025, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.787500011920929, "rewards/quality_reward_func/std": 0.025, "step": 880 }, { "completion_length": 23.6, "completions/clipped_ratio": 0.0, "completions/max_length": 23.6, "completions/max_terminated_length": 23.6, "completions/mean_length": 17.7, "completions/mean_terminated_length": 17.7, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.061193619361936194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8025905980728567, "learning_rate": 2.1290380695281083e-06, "loss": 0.0, "num_tokens": 1276361.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 890 }, { "completion_length": 21.6, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.06188118811881188, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.5589495476335287, "learning_rate": 2.0716101053964965e-06, "loss": 0.0, "num_tokens": 1290076.0, "reward": 4.077499914169311, "reward_std": 0.01499999314546585, "rewards/coherence_reward_func/mean": 1.2774999618530274, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 900 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.06256875687568757, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0769402489066124, "learning_rate": 2.0144141219500707e-06, "loss": 0.0, "num_tokens": 1304321.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 910 }, { "completion_length": 24.2, "completions/clipped_ratio": 0.0, "completions/max_length": 24.2, "completions/max_terminated_length": 24.2, "completions/mean_length": 20.775, "completions/mean_terminated_length": 20.775, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.06325632563256325, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7756512950756587, "learning_rate": 1.9574810918228667e-06, "loss": 0.0, "num_tokens": 1318764.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 920 }, { "completion_length": 28.2, "completions/clipped_ratio": 0.0, "completions/max_length": 28.2, "completions/max_terminated_length": 28.2, "completions/mean_length": 24.375, "completions/mean_terminated_length": 24.375, "completions/min_length": 21.1, "completions/min_terminated_length": 21.1, "epoch": 0.06394389438943894, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.987953900906723, "learning_rate": 1.9008418452550579e-06, "loss": 0.0, "num_tokens": 1332931.0, "reward": 3.9799998998641968, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2299999594688416, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 1.95, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 930 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.06463146314631463, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7319877591449767, "learning_rate": 1.8445270533978387e-06, "loss": 0.0, "num_tokens": 1344754.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 940 }, { "completion_length": 22.2, "completions/clipped_ratio": 0.0, "completions/max_length": 22.2, "completions/max_terminated_length": 22.2, "completions/mean_length": 18.4, "completions/mean_terminated_length": 18.4, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.06531903190319031, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8628140719607472, "learning_rate": 1.788567211704453e-06, "loss": 0.0, "num_tokens": 1359610.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 950 }, { "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.8, "completions/min_terminated_length": 13.8, "epoch": 0.066006600660066, "frac_reward_zero_std": 0.9, "grad_norm": 2.449816201988142e-05, "kl": 0.7857629887759685, "learning_rate": 1.7329926234163694e-06, "loss": 0.0, "num_tokens": 1373758.0, "reward": 4.087499904632568, "reward_std": 0.025, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.787500011920929, "rewards/quality_reward_func/std": 0.025, "step": 960 }, { "completion_length": 22.3, "completions/clipped_ratio": 0.0, "completions/max_length": 22.3, "completions/max_terminated_length": 22.3, "completions/mean_length": 19.05, "completions/mean_terminated_length": 19.05, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.0666941694169417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.736820934060961, "learning_rate": 1.677833383153542e-06, "loss": 0.0, "num_tokens": 1387248.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 970 }, { "completion_length": 22.6, "completions/clipped_ratio": 0.0, "completions/max_length": 22.6, "completions/max_terminated_length": 22.6, "completions/mean_length": 18.175, "completions/mean_terminated_length": 18.175, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.06738173817381739, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.5687850050628185, "learning_rate": 1.6231193606176415e-06, "loss": 0.0, "num_tokens": 1402547.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 980 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.025, "completions/mean_terminated_length": 18.025, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.06806930693069307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.82333648793865, "learning_rate": 1.5688801844170846e-06, "loss": 0.0, "num_tokens": 1416408.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 990 }, { "completion_length": 23.7, "completions/clipped_ratio": 0.0, "completions/max_length": 23.7, "completions/max_terminated_length": 23.7, "completions/mean_length": 19.85, "completions/mean_terminated_length": 19.85, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.06875687568756876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.819248874951154, "learning_rate": 1.5151452260226224e-06, "loss": 0.0, "num_tokens": 1429986.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1000 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.06944444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7394288497045636, "learning_rate": 1.4619435838621677e-06, "loss": 0.0, "num_tokens": 1445078.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1010 }, { "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 20.9, "completions/mean_terminated_length": 20.9, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.07013201320132013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.5150950760114938, "learning_rate": 1.4093040675634834e-06, "loss": 0.0, "num_tokens": 1460718.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1020 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.525, "completions/mean_terminated_length": 18.525, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.07081958195819582, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8705919556319713, "learning_rate": 1.3572551823532654e-06, "loss": 0.0, "num_tokens": 1474799.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1030 }, { "completion_length": 20.4, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 17.7, "completions/mean_terminated_length": 17.7, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.07150715071507151, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7958902813494205, "learning_rate": 1.305825113621051e-06, "loss": 0.0, "num_tokens": 1487611.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1040 }, { "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 19.075, "completions/mean_terminated_length": 19.075, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.0721947194719472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6268668082542718, "learning_rate": 1.2550417116563413e-06, "loss": 0.0, "num_tokens": 1501134.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1050 }, { "completion_length": 22.2, "completions/clipped_ratio": 0.0, "completions/max_length": 22.2, "completions/max_terminated_length": 22.2, "completions/mean_length": 18.45, "completions/mean_terminated_length": 18.45, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.07288228822882288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7750688172876835, "learning_rate": 1.204932476567175e-06, "loss": 0.0, "num_tokens": 1515356.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1060 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 15.325, "completions/mean_terminated_length": 15.325, "completions/min_length": 13.6, "completions/min_terminated_length": 13.6, "epoch": 0.07356985698569857, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.8960678532719613, "learning_rate": 1.1555245433883322e-06, "loss": 0.0, "num_tokens": 1529909.0, "reward": 4.087499904632568, "reward_std": 0.025, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.787500011920929, "rewards/quality_reward_func/std": 0.025, "step": 1070 }, { "completion_length": 22.9, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.07425742574257425, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7110856842249632, "learning_rate": 1.1068446673872394e-06, "loss": 0.0, "num_tokens": 1545037.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1080 }, { "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.15, "completions/mean_terminated_length": 19.15, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.07494499449944994, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8430487784557045, "learning_rate": 1.0589192095755172e-06, "loss": 0.0, "num_tokens": 1560495.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1090 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.825, "completions/mean_terminated_length": 17.825, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.07563256325632563, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8941279411315918, "learning_rate": 1.0117741224340255e-06, "loss": 0.0, "num_tokens": 1575040.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1100 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 17.175, "completions/mean_terminated_length": 17.175, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.07632013201320131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8955685901921242, "learning_rate": 9.654349358591437e-07, "loss": 0.0, "num_tokens": 1590247.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1110 }, { "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.425, "completions/mean_terminated_length": 19.425, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.07700770077007701, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.836240575928241, "learning_rate": 9.199267433378728e-07, "loss": 0.0, "num_tokens": 1606988.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1120 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.0776952695269527, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8303040105034597, "learning_rate": 8.752741883592792e-07, "loss": 0.0, "num_tokens": 1622165.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1130 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.07838283828382839, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9129018164705485, "learning_rate": 8.315014510696004e-07, "loss": 0.0, "num_tokens": 1637579.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1140 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.95, "completions/mean_terminated_length": 16.95, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.07907040704070407, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.8532706722617149, "learning_rate": 7.886322351782782e-07, "loss": 0.0, "num_tokens": 1650761.0, "reward": 4.074999904632568, "reward_std": 0.028867512941360474, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.7750000119209289, "rewards/quality_reward_func/std": 0.028867512941360474, "step": 1150 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.07975797579757976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8230630304664374, "learning_rate": 7.466897551219779e-07, "loss": 0.0, "num_tokens": 1666838.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1160 }, { "completion_length": 22.4, "completions/clipped_ratio": 0.0, "completions/max_length": 22.4, "completions/max_terminated_length": 22.4, "completions/mean_length": 18.775, "completions/mean_terminated_length": 18.775, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.08044554455445545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8656837910413742, "learning_rate": 7.056967234935583e-07, "loss": 0.0, "num_tokens": 1681949.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1170 }, { "completion_length": 24.8, "completions/clipped_ratio": 0.0, "completions/max_length": 24.8, "completions/max_terminated_length": 24.8, "completions/mean_length": 21.35, "completions/mean_terminated_length": 21.35, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.08113311331133113, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007500413339585066, "kl": 0.5578598533757031, "learning_rate": 6.656753387428089e-07, "loss": 0.0, "num_tokens": 1694679.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1180 }, { "completion_length": 23.2, "completions/clipped_ratio": 0.0, "completions/max_length": 23.2, "completions/max_terminated_length": 23.2, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.08182068206820682, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6424278654158115, "learning_rate": 6.266472731555928e-07, "loss": 0.0, "num_tokens": 1710533.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1190 }, { "completion_length": 22.3, "completions/clipped_ratio": 0.0, "completions/max_length": 22.3, "completions/max_terminated_length": 22.3, "completions/mean_length": 18.575, "completions/mean_terminated_length": 18.575, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.08250825082508251, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7231093045324087, "learning_rate": 5.886336611179211e-07, "loss": 0.0, "num_tokens": 1725716.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1200 }, { "completion_length": 21.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 18.2, "completions/mean_terminated_length": 18.2, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.0831958195819582, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6847004944458603, "learning_rate": 4.154639175257732e-06, "loss": 0.0, "num_tokens": 1740952.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1210 }, { "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.45, "completions/mean_terminated_length": 23.45, "completions/min_length": 19.1, "completions/min_terminated_length": 19.1, "epoch": 0.08388338833883388, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.5057500394526869, "learning_rate": 4.189003436426117e-06, "loss": 0.0, "num_tokens": 1756770.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1220 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 17.825, "completions/mean_terminated_length": 17.825, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.08457095709570957, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7115267224609851, "learning_rate": 4.223367697594502e-06, "loss": 0.0, "num_tokens": 1771891.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1230 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.725, "completions/mean_terminated_length": 18.725, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.08525852585258525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8612367749214173, "learning_rate": 4.257731958762887e-06, "loss": 0.0, "num_tokens": 1786328.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1240 }, { "completion_length": 23.1, "completions/clipped_ratio": 0.0, "completions/max_length": 23.1, "completions/max_terminated_length": 23.1, "completions/mean_length": 19.1, "completions/mean_terminated_length": 19.1, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.08594609460946094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7600005578249693, "learning_rate": 4.292096219931272e-06, "loss": 0.0, "num_tokens": 1800544.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1250 }, { "completion_length": 21.9, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 19.4, "completions/mean_terminated_length": 19.4, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.08663366336633663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6648030929267407, "learning_rate": 4.326460481099657e-06, "loss": 0.0, "num_tokens": 1813988.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1260 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 17.35, "completions/mean_terminated_length": 17.35, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.08732123212321233, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 872.6091574197635, "learning_rate": 4.360824742268042e-06, "loss": 0.0426, "num_tokens": 1825918.0, "reward": 3.9974998950958254, "reward_std": 0.20499999523162843, "rewards/coherence_reward_func/mean": 1.267499953508377, "rewards/coherence_reward_func/std": 0.06499999761581421, "rewards/formatting_reward_func/mean": 1.95, "rewards/formatting_reward_func/std": 0.1, "rewards/quality_reward_func/mean": 0.7800000131130218, "rewards/quality_reward_func/std": 0.04000000059604645, "step": 1270 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.35, "completions/mean_terminated_length": 17.35, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.08800880088008801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9469352498650551, "learning_rate": 4.395189003436426e-06, "loss": 0.0, "num_tokens": 1838528.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1280 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 18.625, "completions/mean_terminated_length": 18.625, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.0886963696369637, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8693209359422326, "learning_rate": 4.429553264604811e-06, "loss": 0.0, "num_tokens": 1851685.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1290 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.08938393839383939, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.5836437934543938, "learning_rate": 4.463917525773197e-06, "loss": 0.0, "num_tokens": 1864032.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1300 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 19.3, "completions/mean_terminated_length": 19.3, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.09007150715071507, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1.0852753214538098, "learning_rate": 4.498281786941581e-06, "loss": 0.0, "num_tokens": 1879600.0, "reward": 4.087499904632568, "reward_std": 0.025, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.787500011920929, "rewards/quality_reward_func/std": 0.025, "step": 1310 }, { "completion_length": 23.9, "completions/clipped_ratio": 0.0, "completions/max_length": 23.9, "completions/max_terminated_length": 23.9, "completions/mean_length": 20.65, "completions/mean_terminated_length": 20.65, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.09075907590759076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6119156501255929, "learning_rate": 4.532646048109966e-06, "loss": 0.0, "num_tokens": 1895626.0, "reward": 4.06999990940094, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2699999570846559, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1320 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.1, "completions/mean_terminated_length": 15.1, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.09144664466446645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2755441337823867, "learning_rate": 4.567010309278351e-06, "loss": 0.0, "num_tokens": 1910970.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1330 }, { "completion_length": 22.8, "completions/clipped_ratio": 0.0, "completions/max_length": 22.8, "completions/max_terminated_length": 22.8, "completions/mean_length": 19.2, "completions/mean_terminated_length": 19.2, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.09213421342134213, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6432308156974613, "learning_rate": 4.601374570446736e-06, "loss": 0.0, "num_tokens": 1925750.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1340 }, { "completion_length": 23.4, "completions/clipped_ratio": 0.0, "completions/max_length": 23.4, "completions/max_terminated_length": 23.4, "completions/mean_length": 19.475, "completions/mean_terminated_length": 19.475, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.09282178217821782, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7006123065948486, "learning_rate": 4.635738831615121e-06, "loss": 0.0, "num_tokens": 1939813.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1350 }, { "completion_length": 27.6, "completions/clipped_ratio": 0.0, "completions/max_length": 27.6, "completions/max_terminated_length": 27.6, "completions/mean_length": 22.375, "completions/mean_terminated_length": 22.375, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.09350935093509351, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.7452327590435743, "learning_rate": 4.670103092783506e-06, "loss": 0.0, "num_tokens": 1952404.0, "reward": 4.092499876022339, "reward_std": 0.015000002086162567, "rewards/coherence_reward_func/mean": 1.2924999475479126, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1360 }, { "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.475, "completions/mean_terminated_length": 21.475, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0941969196919692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8505723314359784, "learning_rate": 4.704467353951891e-06, "loss": 0.0, "num_tokens": 1967187.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1370 }, { "completion_length": 24.8, "completions/clipped_ratio": 0.0, "completions/max_length": 24.8, "completions/max_terminated_length": 24.8, "completions/mean_length": 21.625, "completions/mean_terminated_length": 21.625, "completions/min_length": 18.4, "completions/min_terminated_length": 18.4, "epoch": 0.09488448844884488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8517201460897923, "learning_rate": 4.738831615120275e-06, "loss": 0.0, "num_tokens": 1982672.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1380 }, { "completion_length": 24.2, "completions/clipped_ratio": 0.0, "completions/max_length": 24.2, "completions/max_terminated_length": 24.2, "completions/mean_length": 21.525, "completions/mean_terminated_length": 21.525, "completions/min_length": 19.5, "completions/min_terminated_length": 19.5, "epoch": 0.09557205720572057, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7543423019349575, "learning_rate": 4.7731958762886605e-06, "loss": 0.0, "num_tokens": 1996669.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1390 }, { "completion_length": 23.4, "completions/clipped_ratio": 0.0, "completions/max_length": 23.4, "completions/max_terminated_length": 23.4, "completions/mean_length": 19.35, "completions/mean_terminated_length": 19.35, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.09625962596259625, "frac_reward_zero_std": 1.0, "grad_norm": 0.00021786447905469686, "kl": 0.8556835754774511, "learning_rate": 4.8075601374570455e-06, "loss": 0.0, "num_tokens": 2011723.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1400 }, { "completion_length": 24.6, "completions/clipped_ratio": 0.0, "completions/max_length": 24.6, "completions/max_terminated_length": 24.6, "completions/mean_length": 21.225, "completions/mean_terminated_length": 21.225, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.09694719471947194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7849578246474266, "learning_rate": 4.84192439862543e-06, "loss": 0.0, "num_tokens": 2024768.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1410 }, { "completion_length": 22.9, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 20.025, "completions/mean_terminated_length": 20.025, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.09763476347634764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9401213183999062, "learning_rate": 4.8762886597938146e-06, "loss": 0.0, "num_tokens": 2040337.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1420 }, { "completion_length": 25.7, "completions/clipped_ratio": 0.0, "completions/max_length": 25.7, "completions/max_terminated_length": 25.7, "completions/mean_length": 22.525, "completions/mean_terminated_length": 22.525, "completions/min_length": 19.5, "completions/min_terminated_length": 19.5, "epoch": 0.09832233223322333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.678148178756237, "learning_rate": 4.9106529209621995e-06, "loss": 0.0, "num_tokens": 2056182.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1430 }, { "completion_length": 23.4, "completions/clipped_ratio": 0.0, "completions/max_length": 23.4, "completions/max_terminated_length": 23.4, "completions/mean_length": 20.475, "completions/mean_terminated_length": 20.475, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.09900990099009901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7785345762968063, "learning_rate": 4.9450171821305845e-06, "loss": 0.0, "num_tokens": 2068793.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1440 }, { "completion_length": 24.4, "completions/clipped_ratio": 0.0, "completions/max_length": 24.4, "completions/max_terminated_length": 24.4, "completions/mean_length": 20.775, "completions/mean_terminated_length": 20.775, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.0996974697469747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8162919467315077, "learning_rate": 4.9793814432989694e-06, "loss": 0.0, "num_tokens": 2083912.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1450 }, { "completion_length": 24.7, "completions/clipped_ratio": 0.0, "completions/max_length": 24.7, "completions/max_terminated_length": 24.7, "completions/mean_length": 20.95, "completions/mean_terminated_length": 20.95, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.10038503850385039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6797867778688669, "learning_rate": 4.9999988478294445e-06, "loss": 0.0, "num_tokens": 2096974.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1460 }, { "completion_length": 21.9, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 19.725, "completions/mean_terminated_length": 19.725, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.10107260726072607, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9316600441932679, "learning_rate": 4.999985885922894e-06, "loss": 0.0, "num_tokens": 2110847.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1470 }, { "completion_length": 23.2, "completions/clipped_ratio": 0.0, "completions/max_length": 23.2, "completions/max_terminated_length": 23.2, "completions/mean_length": 19.7, "completions/mean_terminated_length": 19.7, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.10176017601760176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0361852012574673, "learning_rate": 4.999958521971518e-06, "loss": 0.0, "num_tokens": 2125615.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1480 }, { "completion_length": 22.6, "completions/clipped_ratio": 0.0, "completions/max_length": 22.6, "completions/max_terminated_length": 22.6, "completions/mean_length": 19.7, "completions/mean_terminated_length": 19.7, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.10244774477447745, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9184820102527738, "learning_rate": 4.999916756132957e-06, "loss": 0.0, "num_tokens": 2140815.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1490 }, { "completion_length": 23.1, "completions/clipped_ratio": 0.0, "completions/max_length": 23.1, "completions/max_terminated_length": 23.1, "completions/mean_length": 19.9, "completions/mean_terminated_length": 19.9, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.10313531353135313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.796672991476953, "learning_rate": 4.999860588647817e-06, "loss": 0.0, "num_tokens": 2153719.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1500 }, { "completion_length": 22.4, "completions/clipped_ratio": 0.0, "completions/max_length": 22.4, "completions/max_terminated_length": 22.4, "completions/mean_length": 19.7, "completions/mean_terminated_length": 19.7, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.10382288228822882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9858798503875732, "learning_rate": 4.999790019839672e-06, "loss": 0.0, "num_tokens": 2168859.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1510 }, { "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 20.45, "completions/mean_terminated_length": 20.45, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.10451045104510451, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.82629788313061, "learning_rate": 4.999705050115057e-06, "loss": 0.0, "num_tokens": 2186221.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1520 }, { "completion_length": 25.8, "completions/clipped_ratio": 0.0, "completions/max_length": 25.8, "completions/max_terminated_length": 25.8, "completions/mean_length": 22.15, "completions/mean_terminated_length": 22.15, "completions/min_length": 18.3, "completions/min_terminated_length": 18.3, "epoch": 0.1051980198019802, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6753423534333706, "learning_rate": 4.99960567996347e-06, "loss": 0.0, "num_tokens": 2200983.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1530 }, { "completion_length": 24.3, "completions/clipped_ratio": 0.0, "completions/max_length": 24.3, "completions/max_terminated_length": 24.3, "completions/mean_length": 22.425, "completions/mean_terminated_length": 22.425, "completions/min_length": 20.1, "completions/min_terminated_length": 20.1, "epoch": 0.10588558855885588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7230565965175628, "learning_rate": 4.999491909957368e-06, "loss": 0.0, "num_tokens": 2216356.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1540 }, { "completion_length": 26.4, "completions/clipped_ratio": 0.0, "completions/max_length": 26.4, "completions/max_terminated_length": 26.4, "completions/mean_length": 22.35, "completions/mean_terminated_length": 22.35, "completions/min_length": 18.9, "completions/min_terminated_length": 18.9, "epoch": 0.10657315731573157, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8518215868622064, "learning_rate": 4.999363740752162e-06, "loss": 0.0, "num_tokens": 2233090.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1550 }, { "completion_length": 20.8, "completions/clipped_ratio": 0.0, "completions/max_length": 20.8, "completions/max_terminated_length": 20.8, "completions/mean_length": 17.8, "completions/mean_terminated_length": 17.8, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.10726072607260725, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.9032241486944258, "learning_rate": 4.999221173086218e-06, "loss": 0.0, "num_tokens": 2245678.0, "reward": 4.06999990940094, "reward_std": 0.059999996423721315, "rewards/coherence_reward_func/mean": 1.2824999570846558, "rewards/coherence_reward_func/std": 0.034999996423721313, "rewards/formatting_reward_func/mean": 1.9875, "rewards/formatting_reward_func/std": 0.025, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1560 }, { "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.125, "completions/mean_terminated_length": 23.125, "completions/min_length": 18.8, "completions/min_terminated_length": 18.8, "epoch": 0.10794829482948295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7683502836152911, "learning_rate": 4.9990642077808445e-06, "loss": 0.0, "num_tokens": 2260679.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1570 }, { "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.975, "completions/mean_terminated_length": 20.975, "completions/min_length": 18.9, "completions/min_terminated_length": 18.9, "epoch": 0.10863586358635864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8008473014459014, "learning_rate": 4.9988928457402965e-06, "loss": 0.0, "num_tokens": 2273774.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1580 }, { "completion_length": 24.2, "completions/clipped_ratio": 0.0, "completions/max_length": 24.2, "completions/max_terminated_length": 24.2, "completions/mean_length": 20.6, "completions/mean_terminated_length": 20.6, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.10932343234323433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8917723886668683, "learning_rate": 4.998707087951764e-06, "loss": 0.0, "num_tokens": 2290346.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1590 }, { "completion_length": 26.9, "completions/clipped_ratio": 0.0, "completions/max_length": 26.9, "completions/max_terminated_length": 26.9, "completions/mean_length": 22.675, "completions/mean_terminated_length": 22.675, "completions/min_length": 19.5, "completions/min_terminated_length": 19.5, "epoch": 0.11001100110011001, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7571378488093614, "learning_rate": 4.998506935485372e-06, "loss": 0.0, "num_tokens": 2304585.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1600 }, { "completion_length": 26.6, "completions/clipped_ratio": 0.0, "completions/max_length": 26.6, "completions/max_terminated_length": 26.6, "completions/mean_length": 22.725, "completions/mean_terminated_length": 22.725, "completions/min_length": 19.3, "completions/min_terminated_length": 19.3, "epoch": 0.1106985698569857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7093593463301658, "learning_rate": 4.998292389494166e-06, "loss": 0.0, "num_tokens": 2316902.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1610 }, { "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.2, "completions/mean_terminated_length": 25.2, "completions/min_length": 22.2, "completions/min_terminated_length": 22.2, "epoch": 0.11138613861386139, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 4.4980769870104265, "learning_rate": 4.998063451214116e-06, "loss": 0.0007, "num_tokens": 2330646.0, "reward": 4.002499914169311, "reward_std": 0.045000004768371585, "rewards/coherence_reward_func/mean": 1.2399999618530273, "rewards/coherence_reward_func/std": 0.020000000298023225, "rewards/formatting_reward_func/mean": 1.9625, "rewards/formatting_reward_func/std": 0.025, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1620 }, { "completion_length": 22.8, "completions/clipped_ratio": 0.0, "completions/max_length": 22.8, "completions/max_terminated_length": 22.8, "completions/mean_length": 19.9, "completions/mean_terminated_length": 19.9, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.11207370737073707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9067626856267452, "learning_rate": 4.9978201219641e-06, "loss": 0.0, "num_tokens": 2345394.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1630 }, { "completion_length": 24.1, "completions/clipped_ratio": 0.0, "completions/max_length": 24.1, "completions/max_terminated_length": 24.1, "completions/mean_length": 21.725, "completions/mean_terminated_length": 21.725, "completions/min_length": 19.4, "completions/min_terminated_length": 19.4, "epoch": 0.11276127612761276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6573985446244478, "learning_rate": 4.9975624031459e-06, "loss": 0.0, "num_tokens": 2359587.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1640 }, { "completion_length": 22.7, "completions/clipped_ratio": 0.0, "completions/max_length": 22.7, "completions/max_terminated_length": 22.7, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.11344884488448845, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7965472146868706, "learning_rate": 4.997290296244199e-06, "loss": 0.0, "num_tokens": 2374833.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1650 }, { "completion_length": 27.9, "completions/clipped_ratio": 0.0, "completions/max_length": 27.9, "completions/max_terminated_length": 27.9, "completions/mean_length": 21.975, "completions/mean_terminated_length": 21.975, "completions/min_length": 18.2, "completions/min_terminated_length": 18.2, "epoch": 0.11413641364136413, "frac_reward_zero_std": 0.9, "grad_norm": 0.3535076081752777, "kl": 0.672401818819344, "learning_rate": 4.997003802826561e-06, "loss": 0.0, "num_tokens": 2387288.0, "reward": 4.06999990940094, "reward_std": 0.059999996423721315, "rewards/coherence_reward_func/mean": 1.2824999570846558, "rewards/coherence_reward_func/std": 0.034999996423721313, "rewards/formatting_reward_func/mean": 1.9875, "rewards/formatting_reward_func/std": 0.025, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1660 }, { "completion_length": 25.9, "completions/clipped_ratio": 0.0, "completions/max_length": 25.9, "completions/max_terminated_length": 25.9, "completions/mean_length": 21.55, "completions/mean_terminated_length": 21.55, "completions/min_length": 18.3, "completions/min_terminated_length": 18.3, "epoch": 0.11482398239823982, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7575838030315936, "learning_rate": 4.996702924543433e-06, "loss": 0.0, "num_tokens": 2402070.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1670 }, { "completion_length": 23.2, "completions/clipped_ratio": 0.0, "completions/max_length": 23.2, "completions/max_terminated_length": 23.2, "completions/mean_length": 20.45, "completions/mean_terminated_length": 20.45, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.11551155115511551, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7976297162473202, "learning_rate": 4.996387663128131e-06, "loss": 0.0, "num_tokens": 2415944.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1680 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 19.275, "completions/mean_terminated_length": 19.275, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.1161991199119912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8864114709198475, "learning_rate": 4.996058020396826e-06, "loss": 0.0, "num_tokens": 2430339.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1690 }, { "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.025, "completions/mean_terminated_length": 21.025, "completions/min_length": 18.4, "completions/min_terminated_length": 18.4, "epoch": 0.11688668866886688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0341297045350075, "learning_rate": 4.995713998248543e-06, "loss": 0.0, "num_tokens": 2446412.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1700 }, { "completion_length": 23.2, "completions/clipped_ratio": 0.0, "completions/max_length": 23.2, "completions/max_terminated_length": 23.2, "completions/mean_length": 19.925, "completions/mean_terminated_length": 19.925, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.11757425742574257, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8104997374117374, "learning_rate": 4.99535559866514e-06, "loss": 0.0, "num_tokens": 2461921.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1710 }, { "completion_length": 24.1, "completions/clipped_ratio": 0.0, "completions/max_length": 24.1, "completions/max_terminated_length": 24.1, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.7, "completions/min_terminated_length": 18.7, "epoch": 0.11826182618261827, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7497173383831978, "learning_rate": 4.994982823711306e-06, "loss": 0.0, "num_tokens": 2477327.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1720 }, { "completion_length": 23.7, "completions/clipped_ratio": 0.0, "completions/max_length": 23.7, "completions/max_terminated_length": 23.7, "completions/mean_length": 20.925, "completions/mean_terminated_length": 20.925, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.11894939493949395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.870328263938427, "learning_rate": 4.99459567553454e-06, "loss": 0.0, "num_tokens": 2491144.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1730 }, { "completion_length": 22.6, "completions/clipped_ratio": 0.0, "completions/max_length": 22.6, "completions/max_terminated_length": 22.6, "completions/mean_length": 19.025, "completions/mean_terminated_length": 19.025, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.11963696369636964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8187686018645763, "learning_rate": 4.994194156365145e-06, "loss": 0.0, "num_tokens": 2504629.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1740 }, { "completion_length": 27.6, "completions/clipped_ratio": 0.0, "completions/max_length": 27.6, "completions/max_terminated_length": 27.6, "completions/mean_length": 22.225, "completions/mean_terminated_length": 22.225, "completions/min_length": 18.2, "completions/min_terminated_length": 18.2, "epoch": 0.12032453245324533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6447806656360626, "learning_rate": 4.993778268516213e-06, "loss": 0.0, "num_tokens": 2519430.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1750 }, { "completion_length": 22.7, "completions/clipped_ratio": 0.0, "completions/max_length": 22.7, "completions/max_terminated_length": 22.7, "completions/mean_length": 19.15, "completions/mean_terminated_length": 19.15, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.12101210121012101, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8798821037635207, "learning_rate": 4.993348014383612e-06, "loss": 0.0, "num_tokens": 2533524.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1760 }, { "completion_length": 25.1, "completions/clipped_ratio": 0.0, "completions/max_length": 25.1, "completions/max_terminated_length": 25.1, "completions/mean_length": 21.9, "completions/mean_terminated_length": 21.9, "completions/min_length": 18.9, "completions/min_terminated_length": 18.9, "epoch": 0.1216996699669967, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7982191428542137, "learning_rate": 4.9929033964459714e-06, "loss": 0.0, "num_tokens": 2546472.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1770 }, { "completion_length": 25.1, "completions/clipped_ratio": 0.0, "completions/max_length": 25.1, "completions/max_terminated_length": 25.1, "completions/mean_length": 21.025, "completions/mean_terminated_length": 21.025, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.12238723872387239, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.767993874847889, "learning_rate": 4.992444417264668e-06, "loss": 0.0, "num_tokens": 2560665.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1780 }, { "completion_length": 25.2, "completions/clipped_ratio": 0.0, "completions/max_length": 25.2, "completions/max_terminated_length": 25.2, "completions/mean_length": 22.175, "completions/mean_terminated_length": 22.175, "completions/min_length": 18.6, "completions/min_terminated_length": 18.6, "epoch": 0.12307480748074807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8567698059370741, "learning_rate": 4.991971079483814e-06, "loss": 0.0, "num_tokens": 2576304.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1790 }, { "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.12376237623762376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8909156315028668, "learning_rate": 4.991483385830236e-06, "loss": 0.0, "num_tokens": 2590680.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1800 }, { "completion_length": 25.3, "completions/clipped_ratio": 0.0, "completions/max_length": 25.3, "completions/max_terminated_length": 25.3, "completions/mean_length": 21.45, "completions/mean_terminated_length": 21.45, "completions/min_length": 18.2, "completions/min_terminated_length": 18.2, "epoch": 0.12444994499449945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7288717043586075, "learning_rate": 4.990981339113465e-06, "loss": 0.0, "num_tokens": 2605530.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1810 }, { "completion_length": 22.7, "completions/clipped_ratio": 0.0, "completions/max_length": 22.7, "completions/max_terminated_length": 22.7, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.12513751375137513, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9725689873099327, "learning_rate": 4.990464942225716e-06, "loss": 0.0, "num_tokens": 2621498.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1820 }, { "completion_length": 25.2, "completions/clipped_ratio": 0.0, "completions/max_length": 25.2, "completions/max_terminated_length": 25.2, "completions/mean_length": 22.3, "completions/mean_terminated_length": 22.3, "completions/min_length": 19.1, "completions/min_terminated_length": 19.1, "epoch": 0.12582508250825084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8795806443318724, "learning_rate": 4.9899341981418755e-06, "loss": 0.0, "num_tokens": 2637542.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1830 }, { "completion_length": 23.8, "completions/clipped_ratio": 0.0, "completions/max_length": 23.8, "completions/max_terminated_length": 23.8, "completions/mean_length": 20.375, "completions/mean_terminated_length": 20.375, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.1265126512651265, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.735470286756754, "learning_rate": 4.98938910991948e-06, "loss": 0.0, "num_tokens": 2651765.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1840 }, { "completion_length": 28.2, "completions/clipped_ratio": 0.0, "completions/max_length": 28.2, "completions/max_terminated_length": 28.2, "completions/mean_length": 23.175, "completions/mean_terminated_length": 23.175, "completions/min_length": 18.7, "completions/min_terminated_length": 18.7, "epoch": 0.1272002200220022, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.7597610068274662, "learning_rate": 4.988829680698702e-06, "loss": 0.0, "num_tokens": 2664136.0, "reward": 4.084999895095825, "reward_std": 0.017320506274700165, "rewards/coherence_reward_func/mean": 1.28499995470047, "rewards/coherence_reward_func/std": 0.017320506274700165, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1850 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.15, "completions/mean_terminated_length": 18.15, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.12788778877887788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8583286347799003, "learning_rate": 4.988255913702329e-06, "loss": 0.0, "num_tokens": 2676534.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1860 }, { "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 19.65, "completions/mean_terminated_length": 19.65, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.12857535753575358, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8509030800312758, "learning_rate": 4.987667812235747e-06, "loss": 0.0, "num_tokens": 2693440.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1870 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 18.8, "completions/mean_terminated_length": 18.8, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.12926292629262925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7957369707524776, "learning_rate": 4.98706537968692e-06, "loss": 0.0, "num_tokens": 2707124.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1880 }, { "completion_length": 23.9, "completions/clipped_ratio": 0.0, "completions/max_length": 23.9, "completions/max_terminated_length": 23.9, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.12995049504950495, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0164528638124466, "learning_rate": 4.986448619526373e-06, "loss": 0.0, "num_tokens": 2721672.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1890 }, { "completion_length": 23.9, "completions/clipped_ratio": 0.0, "completions/max_length": 23.9, "completions/max_terminated_length": 23.9, "completions/mean_length": 20.225, "completions/mean_terminated_length": 20.225, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.13063806380638063, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8201635698787868, "learning_rate": 4.985817535307168e-06, "loss": 0.0, "num_tokens": 2733841.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1900 }, { "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 20.15, "completions/mean_terminated_length": 20.15, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.13132563256325633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.756796540133655, "learning_rate": 4.9851721306648875e-06, "loss": 0.0, "num_tokens": 2746967.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1910 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 18.325, "completions/mean_terminated_length": 18.325, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.132013201320132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8643641278147698, "learning_rate": 4.984512409317611e-06, "loss": 0.0, "num_tokens": 2761816.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1920 }, { "completion_length": 20.9, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 17.45, "completions/mean_terminated_length": 17.45, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.1327007700770077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9937892898917198, "learning_rate": 4.9838383750658945e-06, "loss": 0.0, "num_tokens": 2778086.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1930 }, { "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 19.7, "completions/mean_terminated_length": 19.7, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.1333883388338834, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7064174540340901, "learning_rate": 4.983150031792748e-06, "loss": 0.0, "num_tokens": 2790350.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1940 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.475, "completions/mean_terminated_length": 16.475, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.13407590759075907, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8852555111050606, "learning_rate": 4.982447383463615e-06, "loss": 0.0, "num_tokens": 2803853.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1950 }, { "completion_length": 22.2, "completions/clipped_ratio": 0.0, "completions/max_length": 22.2, "completions/max_terminated_length": 22.2, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.13476347634763478, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8611627735197545, "learning_rate": 4.981730434126347e-06, "loss": 0.0, "num_tokens": 2818004.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1960 }, { "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.45, "completions/mean_terminated_length": 21.45, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.13545104510451045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.5601596589200198, "learning_rate": 4.980999187911182e-06, "loss": 0.0, "num_tokens": 2830102.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1970 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.275, "completions/mean_terminated_length": 18.275, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.13613861386138615, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8438214469701052, "learning_rate": 4.98025364903072e-06, "loss": 0.0, "num_tokens": 2843833.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 1980 }, { "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.35, "completions/mean_terminated_length": 18.35, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.13682618261826182, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.8016061037778854, "learning_rate": 4.979493821779899e-06, "loss": 0.0, "num_tokens": 2858567.0, "reward": 3.9974998950958254, "reward_std": 0.20499999523162843, "rewards/coherence_reward_func/mean": 1.267499953508377, "rewards/coherence_reward_func/std": 0.06499999761581421, "rewards/formatting_reward_func/mean": 1.95, "rewards/formatting_reward_func/std": 0.1, "rewards/quality_reward_func/mean": 0.7800000131130218, "rewards/quality_reward_func/std": 0.04000000059604645, "step": 1990 }, { "completion_length": 22.7, "completions/clipped_ratio": 0.0, "completions/max_length": 22.7, "completions/max_terminated_length": 22.7, "completions/mean_length": 18.85, "completions/mean_terminated_length": 18.85, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.13751375137513752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.912319441139698, "learning_rate": 4.978719710535969e-06, "loss": 0.0, "num_tokens": 2874589.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2000 }, { "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 19.275, "completions/mean_terminated_length": 19.275, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.1382013201320132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7433706654235721, "learning_rate": 4.9779313197584714e-06, "loss": 0.0, "num_tokens": 2889900.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2010 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 18.45, "completions/mean_terminated_length": 18.45, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.1388888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8814809195697307, "learning_rate": 4.977128653989206e-06, "loss": 0.0, "num_tokens": 2905538.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2020 }, { "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.275, "completions/mean_terminated_length": 21.275, "completions/min_length": 18.6, "completions/min_terminated_length": 18.6, "epoch": 0.13957645764576457, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7296697828918696, "learning_rate": 4.976311717852212e-06, "loss": 0.0, "num_tokens": 2919117.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2030 }, { "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.875, "completions/mean_terminated_length": 19.875, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.14026402640264027, "frac_reward_zero_std": 1.0, "grad_norm": 2.5597331841709092e-05, "kl": 0.8254838082939386, "learning_rate": 4.975480516053734e-06, "loss": 0.0, "num_tokens": 2932652.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2040 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 19.15, "completions/mean_terminated_length": 19.15, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.14095159515951594, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8372961468994617, "learning_rate": 4.974635053382203e-06, "loss": 0.0, "num_tokens": 2949058.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2050 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 19.575, "completions/mean_terminated_length": 19.575, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.14163916391639164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.022470085322857, "learning_rate": 4.973775334708202e-06, "loss": 0.0, "num_tokens": 2963625.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2060 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 19.125, "completions/mean_terminated_length": 19.125, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.14232673267326731, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.6930702686309814, "learning_rate": 4.972901364984442e-06, "loss": 0.0, "num_tokens": 2977010.0, "reward": 4.092499876022339, "reward_std": 0.015000002086162567, "rewards/coherence_reward_func/mean": 1.2924999475479126, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2070 }, { "completion_length": 22.2, "completions/clipped_ratio": 0.0, "completions/max_length": 22.2, "completions/max_terminated_length": 22.2, "completions/mean_length": 19.375, "completions/mean_terminated_length": 19.375, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.14301430143014301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9520225204527378, "learning_rate": 4.972013149245731e-06, "loss": 0.0, "num_tokens": 2991037.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2080 }, { "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.15, "completions/mean_terminated_length": 19.15, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.14370187018701872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7216343142092227, "learning_rate": 4.971110692608949e-06, "loss": 0.0, "num_tokens": 3006499.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2090 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.1443894389438944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9042860574787482, "learning_rate": 4.970194000273013e-06, "loss": 0.0, "num_tokens": 3020317.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2100 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 15.175, "completions/mean_terminated_length": 15.175, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.1450770077007701, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0912109043449163, "learning_rate": 4.969263077518849e-06, "loss": 0.0, "num_tokens": 3033824.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2110 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.14576457645764576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7381736582145095, "learning_rate": 4.968317929709366e-06, "loss": 0.0, "num_tokens": 3048847.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2120 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.14645214521452146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9189059458673, "learning_rate": 4.967358562289417e-06, "loss": 0.0, "num_tokens": 3062663.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2130 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 18.425, "completions/mean_terminated_length": 18.425, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.14713971397139713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7546185554936529, "learning_rate": 4.966384980785777e-06, "loss": 0.0, "num_tokens": 3076788.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2140 }, { "completion_length": 21.6, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 18.45, "completions/mean_terminated_length": 18.45, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.14782728272827284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7378853656351566, "learning_rate": 4.9653971908071005e-06, "loss": 0.0, "num_tokens": 3090498.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2150 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 15.625, "completions/mean_terminated_length": 15.625, "completions/min_length": 13.6, "completions/min_terminated_length": 13.6, "epoch": 0.1485148514851485, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.072531382739544, "learning_rate": 4.964395198043898e-06, "loss": 0.0, "num_tokens": 3103227.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2160 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 17.8, "completions/mean_terminated_length": 17.8, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.1492024202420242, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8197686405852437, "learning_rate": 4.963379008268503e-06, "loss": 0.0, "num_tokens": 3114919.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2170 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 18.425, "completions/mean_terminated_length": 18.425, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.14988998899889988, "frac_reward_zero_std": 1.0, "grad_norm": 0.00019916222663596272, "kl": 0.733295165374875, "learning_rate": 4.9623486273350306e-06, "loss": 0.0, "num_tokens": 3130588.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2180 }, { "completion_length": 22.4, "completions/clipped_ratio": 0.0, "completions/max_length": 22.4, "completions/max_terminated_length": 22.4, "completions/mean_length": 18.8, "completions/mean_terminated_length": 18.8, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.15057755775577558, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8483817830681801, "learning_rate": 4.961304061179352e-06, "loss": 0.0, "num_tokens": 3144032.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2190 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 18.075, "completions/mean_terminated_length": 18.075, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.15126512651265125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9663286544382572, "learning_rate": 4.960245315819055e-06, "loss": 0.0, "num_tokens": 3158843.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2200 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.15195269526952696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7434234840329736, "learning_rate": 4.959172397353416e-06, "loss": 0.0, "num_tokens": 3170764.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2210 }, { "completion_length": 20.9, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 17.075, "completions/mean_terminated_length": 17.075, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.15264026402640263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7192586746066809, "learning_rate": 4.958085311963355e-06, "loss": 0.0, "num_tokens": 3186659.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2220 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.15332783278327833, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017744266369845718, "kl": 0.8834232156863436, "learning_rate": 4.95698406591141e-06, "loss": 0.0, "num_tokens": 3201223.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2230 }, { "completion_length": 21.6, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 17.975, "completions/mean_terminated_length": 17.975, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.15401540154015403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.826015540631488, "learning_rate": 4.955868665541694e-06, "loss": 0.0, "num_tokens": 3214418.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2240 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.1547029702970297, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1379321828484534, "learning_rate": 4.954739117279863e-06, "loss": 0.0, "num_tokens": 3229451.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2250 }, { "completion_length": 22.9, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 19.525, "completions/mean_terminated_length": 19.525, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.1553905390539054, "frac_reward_zero_std": 1.0, "grad_norm": 0.00023796246387064457, "kl": 0.770618736371398, "learning_rate": 4.953595427633075e-06, "loss": 0.0, "num_tokens": 3245200.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2260 }, { "completion_length": 20.8, "completions/clipped_ratio": 0.0, "completions/max_length": 20.8, "completions/max_terminated_length": 20.8, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.15607810781078107, "frac_reward_zero_std": 0.9, "grad_norm": 0.2128799855709076, "kl": 0.9899251884780824, "learning_rate": 4.952437603189954e-06, "loss": 0.0, "num_tokens": 3257986.0, "reward": 4.077499914169311, "reward_std": 0.01499999314546585, "rewards/coherence_reward_func/mean": 1.2774999618530274, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2270 }, { "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 18.8, "completions/mean_terminated_length": 18.8, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.15676567656765678, "frac_reward_zero_std": 1.0, "grad_norm": 2.153699824702926e-05, "kl": 0.639985965937376, "learning_rate": 4.951265650620555e-06, "loss": 0.0, "num_tokens": 3271286.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2280 }, { "completion_length": 23.7, "completions/clipped_ratio": 0.0, "completions/max_length": 23.7, "completions/max_terminated_length": 23.7, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.15745324532453245, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.8417373207397759, "learning_rate": 4.950079576676321e-06, "loss": 0.0, "num_tokens": 3286704.0, "reward": 4.092499876022339, "reward_std": 0.015000002086162567, "rewards/coherence_reward_func/mean": 1.2924999475479126, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2290 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 17.525, "completions/mean_terminated_length": 17.525, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.15814081408140815, "frac_reward_zero_std": 1.0, "grad_norm": 7.601361721754074e-05, "kl": 0.9805225431919098, "learning_rate": 4.948879388190047e-06, "loss": 0.0, "num_tokens": 3300665.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2300 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.15882838283828382, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6460956549737602, "learning_rate": 4.9476650920758375e-06, "loss": 0.0, "num_tokens": 3311856.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2310 }, { "completion_length": 22.7, "completions/clipped_ratio": 0.0, "completions/max_length": 22.7, "completions/max_terminated_length": 22.7, "completions/mean_length": 18.875, "completions/mean_terminated_length": 18.875, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.15951595159515952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7565588610246777, "learning_rate": 4.946436695329072e-06, "loss": 0.0, "num_tokens": 3327371.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2320 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.375, "completions/mean_terminated_length": 18.375, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.1602035203520352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8117489916272461, "learning_rate": 4.945194205026361e-06, "loss": 0.0, "num_tokens": 3340534.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2330 }, { "completion_length": 20.9, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 18.1, "completions/mean_terminated_length": 18.1, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.1608910891089109, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8587976403534412, "learning_rate": 4.943937628325506e-06, "loss": 0.0, "num_tokens": 3354778.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2340 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.16157865786578657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9266062453389168, "learning_rate": 4.942666972465455e-06, "loss": 0.0, "num_tokens": 3370723.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2350 }, { "completion_length": 22.6, "completions/clipped_ratio": 0.0, "completions/max_length": 22.6, "completions/max_terminated_length": 22.6, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 13.7, "completions/min_terminated_length": 13.7, "epoch": 0.16226622662266227, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1.0408529471606016, "learning_rate": 4.9413822447662686e-06, "loss": 0.0, "num_tokens": 3383301.0, "reward": 4.087499904632568, "reward_std": 0.025, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.787500011920929, "rewards/quality_reward_func/std": 0.025, "step": 2360 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 17.675, "completions/mean_terminated_length": 17.675, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.16295379537953794, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.8424043988808989, "learning_rate": 4.940083452629069e-06, "loss": 0.0, "num_tokens": 3395652.0, "reward": 4.092499876022339, "reward_std": 0.015000002086162567, "rewards/coherence_reward_func/mean": 1.2924999475479126, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2370 }, { "completion_length": 24.7, "completions/clipped_ratio": 0.0, "completions/max_length": 24.7, "completions/max_terminated_length": 24.7, "completions/mean_length": 21.075, "completions/mean_terminated_length": 21.075, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.16364136413641364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.029233940318227, "learning_rate": 4.938770603536006e-06, "loss": 0.0, "num_tokens": 3410415.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2380 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 19.775, "completions/mean_terminated_length": 19.775, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.16432893289328934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8418177347630262, "learning_rate": 4.937443705050205e-06, "loss": 0.0, "num_tokens": 3425198.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2390 }, { "completion_length": 26.2, "completions/clipped_ratio": 0.0, "completions/max_length": 26.2, "completions/max_terminated_length": 26.2, "completions/mean_length": 21.775, "completions/mean_terminated_length": 21.775, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.16501650165016502, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8315188098698855, "learning_rate": 4.93610276481573e-06, "loss": 0.0, "num_tokens": 3440121.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2400 }, { "completion_length": 24.857142857142858, "completions/clipped_ratio": 0.0, "completions/max_length": 24.857142857142858, "completions/max_terminated_length": 24.857142857142858, "completions/mean_length": 20.875, "completions/mean_terminated_length": 20.875, "completions/min_length": 16.785714285714285, "completions/min_terminated_length": 16.785714285714285, "epoch": 0.16570407040704072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7961281064365592, "learning_rate": 4.934747790557538e-06, "loss": 0.0, "num_tokens": 3454319.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2410 }, { "completion_length": 24.3, "completions/clipped_ratio": 0.0, "completions/max_length": 24.3, "completions/max_terminated_length": 24.3, "completions/mean_length": 21.175, "completions/mean_terminated_length": 21.175, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.1663916391639164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9886185888200998, "learning_rate": 4.933378790081431e-06, "loss": 0.0, "num_tokens": 3468158.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2420 }, { "completion_length": 25.3, "completions/clipped_ratio": 0.0, "completions/max_length": 25.3, "completions/max_terminated_length": 25.3, "completions/mean_length": 21.225, "completions/mean_terminated_length": 21.225, "completions/min_length": 18.4, "completions/min_terminated_length": 18.4, "epoch": 0.1670792079207921, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.97828159481287, "learning_rate": 4.931995771274019e-06, "loss": 0.0, "num_tokens": 3482027.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2430 }, { "completion_length": 22.7, "completions/clipped_ratio": 0.0, "completions/max_length": 22.7, "completions/max_terminated_length": 22.7, "completions/mean_length": 19.55, "completions/mean_terminated_length": 19.55, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.16776677667766776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9910362638533116, "learning_rate": 4.930598742102664e-06, "loss": 0.0, "num_tokens": 3496813.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2440 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 17.85, "completions/mean_terminated_length": 17.85, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.16845434543454346, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1091920994222164, "learning_rate": 4.929187710615444e-06, "loss": 0.0, "num_tokens": 3510907.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2450 }, { "completion_length": 24.4, "completions/clipped_ratio": 0.0, "completions/max_length": 24.4, "completions/max_terminated_length": 24.4, "completions/mean_length": 21.9, "completions/mean_terminated_length": 21.9, "completions/min_length": 19.5, "completions/min_terminated_length": 19.5, "epoch": 0.16914191419141913, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.084050140529871, "learning_rate": 4.927762684941099e-06, "loss": 0.0, "num_tokens": 3525587.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2460 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 19.425, "completions/mean_terminated_length": 19.425, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.16982948294829484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9878407068550586, "learning_rate": 4.926323673288989e-06, "loss": 0.0, "num_tokens": 3541292.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2470 }, { "completion_length": 22.2, "completions/clipped_ratio": 0.0, "completions/max_length": 22.2, "completions/max_terminated_length": 22.2, "completions/mean_length": 20.125, "completions/mean_terminated_length": 20.125, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.1705170517051705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8478510402143001, "learning_rate": 4.924870683949046e-06, "loss": 0.0, "num_tokens": 3556005.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2480 }, { "completion_length": 26.1, "completions/clipped_ratio": 0.0, "completions/max_length": 26.1, "completions/max_terminated_length": 26.1, "completions/mean_length": 22.7, "completions/mean_terminated_length": 22.7, "completions/min_length": 19.9, "completions/min_terminated_length": 19.9, "epoch": 0.1712046204620462, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 0.7569789554923773, "learning_rate": 4.923403725291723e-06, "loss": 0.0, "num_tokens": 3568433.0, "reward": 4.084999895095825, "reward_std": 0.017320506274700165, "rewards/coherence_reward_func/mean": 1.28499995470047, "rewards/coherence_reward_func/std": 0.017320506274700165, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2490 }, { "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 22.675, "completions/mean_terminated_length": 22.675, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.17189218921892188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7698194235563278, "learning_rate": 4.9219228057679504e-06, "loss": 0.0, "num_tokens": 3583992.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2500 }, { "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.175, "completions/mean_terminated_length": 20.175, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.17257975797579758, "frac_reward_zero_std": 0.9, "grad_norm": 0.19882693886756897, "kl": 0.9453626565635205, "learning_rate": 4.920427933909084e-06, "loss": 0.0, "num_tokens": 3597235.0, "reward": 4.087499904632568, "reward_std": 0.025, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.787500011920929, "rewards/quality_reward_func/std": 0.025, "step": 2510 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.17326732673267325, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9986454591155052, "learning_rate": 4.918919118326856e-06, "loss": 0.0, "num_tokens": 3610160.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2520 }, { "completion_length": 23.2, "completions/clipped_ratio": 0.0, "completions/max_length": 23.2, "completions/max_terminated_length": 23.2, "completions/mean_length": 20.475, "completions/mean_terminated_length": 20.475, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.17395489548954896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.31350201331079, "learning_rate": 4.9173963677133286e-06, "loss": 0.0001, "num_tokens": 3623119.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2530 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.17464246424642466, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.455815714597702, "learning_rate": 4.915859690840839e-06, "loss": 0.0, "num_tokens": 3636802.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2540 }, { "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.2, "completions/mean_terminated_length": 20.2, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.17533003300330033, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0864813446998596, "learning_rate": 4.914309096561954e-06, "loss": 0.0, "num_tokens": 3651746.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2550 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 17.65, "completions/mean_terminated_length": 17.65, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.17601760176017603, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0578282084316015, "learning_rate": 4.912744593809415e-06, "loss": 0.0, "num_tokens": 3667952.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2560 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 18.2, "completions/mean_terminated_length": 18.2, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.1767051705170517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.158187449723482, "learning_rate": 4.911166191596089e-06, "loss": 0.0, "num_tokens": 3682512.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2570 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.1773927392739274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3145602520555257, "learning_rate": 4.909573899014914e-06, "loss": 0.0, "num_tokens": 3696591.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2580 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.17808030803080308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5148358970880509, "learning_rate": 4.9079677252388506e-06, "loss": 0.0001, "num_tokens": 3713051.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2590 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 18.775, "completions/mean_terminated_length": 18.775, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.17876787678767878, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.311620868742466, "learning_rate": 4.906347679520824e-06, "loss": 0.0, "num_tokens": 3729834.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2600 }, { "completion_length": 21.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 19.175, "completions/mean_terminated_length": 19.175, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.17945544554455445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2974657468497752, "learning_rate": 4.904713771193677e-06, "loss": 0.0001, "num_tokens": 3745081.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2610 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.18014301430143015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1950997911393642, "learning_rate": 4.903066009670111e-06, "loss": 0.0, "num_tokens": 3761921.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2620 }, { "completion_length": 22.2, "completions/clipped_ratio": 0.0, "completions/max_length": 22.2, "completions/max_terminated_length": 22.2, "completions/mean_length": 19.475, "completions/mean_terminated_length": 19.475, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.18083058305830582, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2528184674680234, "learning_rate": 4.901404404442633e-06, "loss": 0.0001, "num_tokens": 3776080.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2630 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.075, "completions/mean_terminated_length": 17.075, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.18151815181518152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0893051519989967, "learning_rate": 4.899728965083502e-06, "loss": 0.0, "num_tokens": 3790027.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2640 }, { "completion_length": 22.2, "completions/clipped_ratio": 0.0, "completions/max_length": 22.2, "completions/max_terminated_length": 22.2, "completions/mean_length": 19.575, "completions/mean_terminated_length": 19.575, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.1822057205720572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0252834126353263, "learning_rate": 4.898039701244674e-06, "loss": 0.0, "num_tokens": 3805350.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2650 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.1828932893289329, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9304249338107183, "learning_rate": 4.896336622657747e-06, "loss": 0.0, "num_tokens": 3817118.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2660 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.7, "completions/mean_terminated_length": 15.7, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.18358085808580857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2188059924170376, "learning_rate": 4.894619739133899e-06, "loss": 0.0, "num_tokens": 3830254.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2670 }, { "completion_length": 21.6, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.18426842684268427, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1871018297970295, "learning_rate": 4.892889060563841e-06, "loss": 0.0, "num_tokens": 3844148.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2680 }, { "completion_length": 21.9, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 19.325, "completions/mean_terminated_length": 19.325, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.18495599559955997, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0766710020601749, "learning_rate": 4.891144596917753e-06, "loss": 0.0, "num_tokens": 3860593.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2690 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 19.375, "completions/mean_terminated_length": 19.375, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.18564356435643564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0673514403402806, "learning_rate": 4.88938635824523e-06, "loss": 0.0, "num_tokens": 3874672.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2700 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 19.675, "completions/mean_terminated_length": 19.675, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.18633113311331134, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0085795551538468, "learning_rate": 4.88761435467522e-06, "loss": 0.0, "num_tokens": 3889095.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2710 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 18.65, "completions/mean_terminated_length": 18.65, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.18701870187018702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9224094368517399, "learning_rate": 4.885828596415973e-06, "loss": 0.0, "num_tokens": 3902869.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2720 }, { "completion_length": 22.8, "completions/clipped_ratio": 0.0, "completions/max_length": 22.8, "completions/max_terminated_length": 22.8, "completions/mean_length": 20.05, "completions/mean_terminated_length": 20.05, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.18770627062706272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2098381072282791, "learning_rate": 4.884029093754974e-06, "loss": 0.0, "num_tokens": 3915675.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2730 }, { "completion_length": 20.4, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 18.4, "completions/mean_terminated_length": 18.4, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.1883938393839384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2505564127117395, "learning_rate": 4.882215857058888e-06, "loss": 0.0, "num_tokens": 3929907.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2740 }, { "completion_length": 20.8, "completions/clipped_ratio": 0.0, "completions/max_length": 20.8, "completions/max_terminated_length": 20.8, "completions/mean_length": 17.975, "completions/mean_terminated_length": 17.975, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.1890814081408141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9567908208817244, "learning_rate": 4.880388896773503e-06, "loss": 0.0, "num_tokens": 3945066.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2750 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.675, "completions/mean_terminated_length": 18.675, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.18976897689768976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3238054752349853, "learning_rate": 4.878548223423661e-06, "loss": 0.0001, "num_tokens": 3958953.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2760 }, { "completion_length": 20.4, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.19045654565456546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.25826550796628, "learning_rate": 4.876693847613208e-06, "loss": 0.0, "num_tokens": 3971299.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2770 }, { "completion_length": 22.3, "completions/clipped_ratio": 0.0, "completions/max_length": 22.3, "completions/max_terminated_length": 22.3, "completions/mean_length": 19.875, "completions/mean_terminated_length": 19.875, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.19114411441144114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2012567497789859, "learning_rate": 4.874825780024926e-06, "loss": 0.0001, "num_tokens": 3984006.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2780 }, { "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.475, "completions/mean_terminated_length": 21.475, "completions/min_length": 18.4, "completions/min_terminated_length": 18.4, "epoch": 0.19183168316831684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9388201177120209, "learning_rate": 4.872944031420471e-06, "loss": 0.0, "num_tokens": 3997725.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2790 }, { "completion_length": 21.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 19.05, "completions/mean_terminated_length": 19.05, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1925192519251925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1699418079108, "learning_rate": 4.871048612640316e-06, "loss": 0.0, "num_tokens": 4011735.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2800 }, { "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 19.425, "completions/mean_terminated_length": 19.425, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1932068206820682, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0788509037345648, "learning_rate": 4.869139534603685e-06, "loss": 0.0, "num_tokens": 4023984.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2810 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 18.725, "completions/mean_terminated_length": 18.725, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.19389438943894388, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2839029371738433, "learning_rate": 4.8672168083084925e-06, "loss": 0.0, "num_tokens": 4036573.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2820 }, { "completion_length": 22.4, "completions/clipped_ratio": 0.0, "completions/max_length": 22.4, "completions/max_terminated_length": 22.4, "completions/mean_length": 20.125, "completions/mean_terminated_length": 20.125, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.19458195819581958, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9523771129548549, "learning_rate": 4.865280444831276e-06, "loss": 0.0, "num_tokens": 4050718.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2830 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 19.9, "completions/mean_terminated_length": 19.9, "completions/min_length": 18.4, "completions/min_terminated_length": 18.4, "epoch": 0.19526952695269528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.23244196921587, "learning_rate": 4.8633304553271365e-06, "loss": 0.0, "num_tokens": 4064270.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2840 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 19.675, "completions/mean_terminated_length": 19.675, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.19595709570957096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0265448220074176, "learning_rate": 4.861366851029671e-06, "loss": 0.0, "num_tokens": 4075757.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2850 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 18.375, "completions/mean_terminated_length": 18.375, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.19664466446644666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.093635232001543, "learning_rate": 4.85938964325091e-06, "loss": 0.0, "num_tokens": 4090376.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2860 }, { "completion_length": 21.1, "completions/clipped_ratio": 0.0, "completions/max_length": 21.1, "completions/max_terminated_length": 21.1, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.19733223322332233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3355726785957813, "learning_rate": 4.857398843381253e-06, "loss": 0.0001, "num_tokens": 4105523.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2870 }, { "completion_length": 22.6, "completions/clipped_ratio": 0.0, "completions/max_length": 22.6, "completions/max_terminated_length": 22.6, "completions/mean_length": 19.125, "completions/mean_terminated_length": 19.125, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.19801980198019803, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.827095165103674, "learning_rate": 4.855394462889401e-06, "loss": 0.0, "num_tokens": 4118800.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2880 }, { "completion_length": 21.1, "completions/clipped_ratio": 0.0, "completions/max_length": 21.1, "completions/max_terminated_length": 21.1, "completions/mean_length": 18.45, "completions/mean_terminated_length": 18.45, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.1987073707370737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.108222411572933, "learning_rate": 4.853376513322289e-06, "loss": 0.0, "num_tokens": 4132162.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2890 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 20.1, "completions/mean_terminated_length": 20.1, "completions/min_length": 18.1, "completions/min_terminated_length": 18.1, "epoch": 0.1993949394939494, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2065526448190211, "learning_rate": 4.851345006305021e-06, "loss": 0.0, "num_tokens": 4147586.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2900 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.05, "completions/mean_terminated_length": 18.05, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.20008250825082508, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2191161870956422, "learning_rate": 4.849299953540809e-06, "loss": 0.0001, "num_tokens": 4164124.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2910 }, { "completion_length": 23.1, "completions/clipped_ratio": 0.0, "completions/max_length": 23.1, "completions/max_terminated_length": 23.1, "completions/mean_length": 20.875, "completions/mean_terminated_length": 20.875, "completions/min_length": 18.6, "completions/min_terminated_length": 18.6, "epoch": 0.20077007700770078, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2381049145013094, "learning_rate": 4.847241366810893e-06, "loss": 0.0, "num_tokens": 4176371.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2920 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.925, "completions/mean_terminated_length": 17.925, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.20145764576457645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5103369668126105, "learning_rate": 4.845169257974485e-06, "loss": 0.0001, "num_tokens": 4192044.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2930 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.20214521452145215, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.260981298983097, "learning_rate": 4.843083638968693e-06, "loss": 0.0, "num_tokens": 4205860.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2940 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.775, "completions/mean_terminated_length": 17.775, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.20283278327832782, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1735950447618961, "learning_rate": 4.8409845218084565e-06, "loss": 0.0, "num_tokens": 4219419.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2950 }, { "completion_length": 23.7, "completions/clipped_ratio": 0.0, "completions/max_length": 23.7, "completions/max_terminated_length": 23.7, "completions/mean_length": 20.875, "completions/mean_terminated_length": 20.875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.20352035203520352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8377597516402602, "learning_rate": 4.838871918586477e-06, "loss": 0.0, "num_tokens": 4233278.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2960 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 19.8, "completions/mean_terminated_length": 19.8, "completions/min_length": 18.1, "completions/min_terminated_length": 18.1, "epoch": 0.2042079207920792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0994380608201026, "learning_rate": 4.836745841473143e-06, "loss": 0.0, "num_tokens": 4248062.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2970 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 19.1, "completions/mean_terminated_length": 19.1, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.2048954895489549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9462737578898668, "learning_rate": 4.834606302716468e-06, "loss": 0.0, "num_tokens": 4263350.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2980 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 18.1, "completions/mean_terminated_length": 18.1, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.2055830583058306, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.181640937179327, "learning_rate": 4.832453314642016e-06, "loss": 0.0, "num_tokens": 4277758.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 2990 }, { "completion_length": 23.8, "completions/clipped_ratio": 0.0, "completions/max_length": 23.8, "completions/max_terminated_length": 23.8, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.20627062706270627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.07782400008291, "learning_rate": 4.830286889652829e-06, "loss": 0.0, "num_tokens": 4291536.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3000 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.8, "completions/mean_terminated_length": 17.8, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.20695819581958197, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0386630140244961, "learning_rate": 4.828107040229356e-06, "loss": 0.0, "num_tokens": 4307800.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3010 }, { "completion_length": 21.6, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 19.3, "completions/mean_terminated_length": 19.3, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.20764576457645764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.036546140164137, "learning_rate": 4.8259137789293845e-06, "loss": 0.0, "num_tokens": 4320712.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3020 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 18.15, "completions/mean_terminated_length": 18.15, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.20833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1517936304211616, "learning_rate": 4.823707118387965e-06, "loss": 0.0, "num_tokens": 4335202.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3030 }, { "completion_length": 21.6, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 18.35, "completions/mean_terminated_length": 18.35, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.20902090209020902, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2074196621775628, "learning_rate": 4.821487071317338e-06, "loss": 0.0, "num_tokens": 4350012.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3040 }, { "completion_length": 23.3, "completions/clipped_ratio": 0.0, "completions/max_length": 23.3, "completions/max_terminated_length": 23.3, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.20970847084708472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2577481478452683, "learning_rate": 4.8192536505068645e-06, "loss": 0.0001, "num_tokens": 4363380.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3050 }, { "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 20.125, "completions/mean_terminated_length": 20.125, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.2103960396039604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.006920213997364, "learning_rate": 4.817006868822946e-06, "loss": 0.0, "num_tokens": 4378297.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3060 }, { "completion_length": 21.9, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 19.9, "completions/mean_terminated_length": 19.9, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.2110836083608361, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1219864902086556, "learning_rate": 4.8147467392089555e-06, "loss": 0.0, "num_tokens": 4390625.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3070 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.21177117711771176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0295380629599094, "learning_rate": 4.812473274685163e-06, "loss": 0.0, "num_tokens": 4402781.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3080 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.525, "completions/mean_terminated_length": 17.525, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.21245874587458746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.281306352838874, "learning_rate": 4.810186488348657e-06, "loss": 0.0, "num_tokens": 4416338.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3090 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.7, "completions/mean_terminated_length": 17.7, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.21314631463146314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5700382247567177, "learning_rate": 4.8078863933732724e-06, "loss": 0.0001, "num_tokens": 4431958.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3100 }, { "completion_length": 20.8, "completions/clipped_ratio": 0.0, "completions/max_length": 20.8, "completions/max_terminated_length": 20.8, "completions/mean_length": 18.975, "completions/mean_terminated_length": 18.975, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.21383388338833884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4087812922894956, "learning_rate": 4.805573003009511e-06, "loss": 0.0001, "num_tokens": 4446569.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3110 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.875, "completions/mean_terminated_length": 17.875, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.2145214521452145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4133543133735658, "learning_rate": 4.80324633058447e-06, "loss": 0.0001, "num_tokens": 4462424.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3120 }, { "completion_length": 24.9, "completions/clipped_ratio": 0.0, "completions/max_length": 24.9, "completions/max_terminated_length": 24.9, "completions/mean_length": 20.725, "completions/mean_terminated_length": 20.725, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.2152090209020902, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8252270920202136, "learning_rate": 4.8009063895017606e-06, "loss": 0.0, "num_tokens": 4477101.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3130 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 19.525, "completions/mean_terminated_length": 19.525, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2158965896589659, "frac_reward_zero_std": 0.9, "grad_norm": 0.2921202480792999, "kl": 0.9959625884890556, "learning_rate": 4.798553193241434e-06, "loss": 0.0, "num_tokens": 4490846.0, "reward": 4.084999895095825, "reward_std": 0.017320506274700165, "rewards/coherence_reward_func/mean": 1.28499995470047, "rewards/coherence_reward_func/std": 0.017320506274700165, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3140 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.21658415841584158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.154418208450079, "learning_rate": 4.796186755359901e-06, "loss": 0.0, "num_tokens": 4507994.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3150 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.21727172717271728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0461080975830555, "learning_rate": 4.793807089489857e-06, "loss": 0.0, "num_tokens": 4521384.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3160 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.21795929592959296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2285627737641334, "learning_rate": 4.791414209340201e-06, "loss": 0.0, "num_tokens": 4535060.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3170 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.6, "completions/mean_terminated_length": 16.6, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.21864686468646866, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.393813919275999, "learning_rate": 4.789008128695959e-06, "loss": 0.0, "num_tokens": 4548072.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3180 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.21933443344334433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.081873746216297, "learning_rate": 4.7865888614182e-06, "loss": 0.0, "num_tokens": 4561204.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3190 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.675, "completions/mean_terminated_length": 19.675, "completions/min_length": 18.3, "completions/min_terminated_length": 18.3, "epoch": 0.22002200220022003, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2661002337932588, "learning_rate": 4.784156421443961e-06, "loss": 0.0, "num_tokens": 4577807.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3200 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.2207095709570957, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1562265411019326, "learning_rate": 4.781710822786166e-06, "loss": 0.0, "num_tokens": 4589415.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3210 }, { "completion_length": 21.3, "completions/clipped_ratio": 0.0, "completions/max_length": 21.3, "completions/max_terminated_length": 21.3, "completions/mean_length": 19.3, "completions/mean_terminated_length": 19.3, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.2213971397139714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8981200024485588, "learning_rate": 4.779252079533543e-06, "loss": 0.0, "num_tokens": 4602467.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3220 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.22208470847084708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1320380866527557, "learning_rate": 4.776780205850543e-06, "loss": 0.0, "num_tokens": 4618673.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3230 }, { "completion_length": 24.2, "completions/clipped_ratio": 0.0, "completions/max_length": 24.2, "completions/max_terminated_length": 24.2, "completions/mean_length": 20.8, "completions/mean_terminated_length": 20.8, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.22277227722772278, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0062423638999463, "learning_rate": 4.774295215977262e-06, "loss": 0.0, "num_tokens": 4632749.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3240 }, { "completion_length": 16.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 15.9, "completions/mean_terminated_length": 15.9, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.22345984598459845, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.346703717112541, "learning_rate": 4.7717971242293544e-06, "loss": 0.0, "num_tokens": 4648281.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3250 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 18.425, "completions/mean_terminated_length": 18.425, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.22414741474147415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1709686018526555, "learning_rate": 4.769285944997953e-06, "loss": 0.0, "num_tokens": 4664606.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3260 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.45, "completions/mean_terminated_length": 17.45, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.22483498349834982, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3686413869261742, "learning_rate": 4.766761692749586e-06, "loss": 0.0, "num_tokens": 4679528.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3270 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.22552255225522552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9682805396616458, "learning_rate": 4.764224382026094e-06, "loss": 0.0, "num_tokens": 4692875.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3280 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.825, "completions/mean_terminated_length": 18.825, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.22621012101210122, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0991791110485791, "learning_rate": 4.761674027444544e-06, "loss": 0.0, "num_tokens": 4708156.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3290 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 18.65, "completions/mean_terminated_length": 18.65, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2268976897689769, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0455322712659836, "learning_rate": 4.759110643697146e-06, "loss": 0.0, "num_tokens": 4722014.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3300 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.925, "completions/mean_terminated_length": 17.925, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.2275852585258526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2300671976059676, "learning_rate": 4.756534245551172e-06, "loss": 0.0, "num_tokens": 4735443.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3310 }, { "completion_length": 16.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.575, "completions/mean_terminated_length": 15.575, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.22827282728272827, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2536677211523055, "learning_rate": 4.753944847848867e-06, "loss": 0.0, "num_tokens": 4748098.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3320 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 18.8, "completions/mean_terminated_length": 18.8, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.22896039603960397, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3008077703416348, "learning_rate": 4.751342465507362e-06, "loss": 0.0, "num_tokens": 4761274.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3330 }, { "completion_length": 22.3, "completions/clipped_ratio": 0.0, "completions/max_length": 22.3, "completions/max_terminated_length": 22.3, "completions/mean_length": 19.325, "completions/mean_terminated_length": 19.325, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.22964796479647964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0198756888508798, "learning_rate": 4.748727113518594e-06, "loss": 0.0, "num_tokens": 4773463.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3340 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 17.35, "completions/mean_terminated_length": 17.35, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.23033553355335534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0109242379665375, "learning_rate": 4.746098806949213e-06, "loss": 0.0, "num_tokens": 4787017.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3350 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 17.025, "completions/mean_terminated_length": 17.025, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.23102310231023102, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1944296956062317, "learning_rate": 4.743457560940503e-06, "loss": 0.0, "num_tokens": 4800622.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3360 }, { "completion_length": 21.3, "completions/clipped_ratio": 0.0, "completions/max_length": 21.3, "completions/max_terminated_length": 21.3, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.23171067106710672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1894298686645925, "learning_rate": 4.740803390708284e-06, "loss": 0.0, "num_tokens": 4815392.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3370 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.2323982398239824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3694863229990006, "learning_rate": 4.738136311542836e-06, "loss": 0.0, "num_tokens": 4831268.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3380 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 18.425, "completions/mean_terminated_length": 18.425, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.2330858085808581, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2427097693085671, "learning_rate": 4.7354563388088026e-06, "loss": 0.0, "num_tokens": 4846697.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3390 }, { "completion_length": 21.6, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 19.075, "completions/mean_terminated_length": 19.075, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.23377337733773376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0708073504269122, "learning_rate": 4.732763487945106e-06, "loss": 0.0, "num_tokens": 4861028.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3400 }, { "completion_length": 29.7, "completions/clipped_ratio": 0.0, "completions/max_length": 29.7, "completions/max_terminated_length": 29.7, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.23446094609460946, "frac_reward_zero_std": 0.9, "grad_norm": 0.3156428635120392, "kl": 0.9965578641742467, "learning_rate": 4.730057774464856e-06, "loss": 0.0, "num_tokens": 4877352.0, "reward": 4.092499876022339, "reward_std": 0.015000002086162567, "rewards/coherence_reward_func/mean": 1.2924999475479126, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3410 }, { "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.875, "completions/mean_terminated_length": 18.875, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.23514851485148514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0397824190557003, "learning_rate": 4.727339213955265e-06, "loss": 0.0, "num_tokens": 4889631.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3420 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 20.225, "completions/mean_terminated_length": 20.225, "completions/min_length": 18.4, "completions/min_terminated_length": 18.4, "epoch": 0.23583608360836084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.215433156117797, "learning_rate": 4.724607822077554e-06, "loss": 0.0, "num_tokens": 4902888.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3430 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 19.05, "completions/mean_terminated_length": 19.05, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.23652365236523654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1691066682338715, "learning_rate": 4.7218636145668615e-06, "loss": 0.0, "num_tokens": 4916974.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3440 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.2, "completions/mean_terminated_length": 16.2, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.2372112211221122, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1441729221493007, "learning_rate": 4.7191066072321575e-06, "loss": 0.0, "num_tokens": 4933274.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3450 }, { "completion_length": 22.4, "completions/clipped_ratio": 0.0, "completions/max_length": 22.4, "completions/max_terminated_length": 22.4, "completions/mean_length": 19.525, "completions/mean_terminated_length": 19.525, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.2378987898789879, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0465805977582932, "learning_rate": 4.716336815956148e-06, "loss": 0.0, "num_tokens": 4946543.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3460 }, { "completion_length": 21.6, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 19.35, "completions/mean_terminated_length": 19.35, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.23858635863586358, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2777705937623978, "learning_rate": 4.713554256695188e-06, "loss": 0.0001, "num_tokens": 4959301.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3470 }, { "completion_length": 20.9, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.23927392739273928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1782601185142993, "learning_rate": 4.710758945479184e-06, "loss": 0.0, "num_tokens": 4973385.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3480 }, { "completion_length": 21.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 18.025, "completions/mean_terminated_length": 18.025, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.23996149614961496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0537080638110639, "learning_rate": 4.7079508984115064e-06, "loss": 0.0, "num_tokens": 4986858.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3490 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.15, "completions/mean_terminated_length": 16.15, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.24064906490649066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.129155667871237, "learning_rate": 4.705130131668894e-06, "loss": 0.0, "num_tokens": 5003140.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3500 }, { "completion_length": 22.9, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 19.525, "completions/mean_terminated_length": 19.525, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.24133663366336633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2283895801752807, "learning_rate": 4.702296661501362e-06, "loss": 0.0001, "num_tokens": 5018057.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3510 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 18.275, "completions/mean_terminated_length": 18.275, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.24202420242024203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1926358938217163, "learning_rate": 4.6994505042321096e-06, "loss": 0.0, "num_tokens": 5031064.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3520 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.6, "completions/mean_terminated_length": 16.6, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.2427117711771177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5111532375216483, "learning_rate": 4.696591676257422e-06, "loss": 0.0, "num_tokens": 5044100.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3530 }, { "completion_length": 21.3, "completions/clipped_ratio": 0.0, "completions/max_length": 21.3, "completions/max_terminated_length": 21.3, "completions/mean_length": 19.4, "completions/mean_terminated_length": 19.4, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.2433993399339934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.169459306448698, "learning_rate": 4.693720194046579e-06, "loss": 0.0, "num_tokens": 5058988.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3540 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.35, "completions/mean_terminated_length": 16.35, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.24408690869086908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.476221612840891, "learning_rate": 4.690836074141762e-06, "loss": 0.0, "num_tokens": 5075874.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3550 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.775, "completions/mean_terminated_length": 17.775, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.24477447744774478, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4168094083666802, "learning_rate": 4.687939333157954e-06, "loss": 0.0001, "num_tokens": 5089925.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3560 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.24546204620462045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2915988519787789, "learning_rate": 4.685029987782845e-06, "loss": 0.0, "num_tokens": 5104875.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3570 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 17.4, "completions/mean_terminated_length": 17.4, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.24614961496149615, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1931571021676064, "learning_rate": 4.682108054776741e-06, "loss": 0.0, "num_tokens": 5118863.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3580 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.24683718371837185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1943508870899677, "learning_rate": 4.67917355097246e-06, "loss": 0.0, "num_tokens": 5132977.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3590 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 17.9, "completions/mean_terminated_length": 17.9, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.24752475247524752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3834453955292703, "learning_rate": 4.676226493275239e-06, "loss": 0.0001, "num_tokens": 5146825.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3600 }, { "completion_length": 21.6, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 18.975, "completions/mean_terminated_length": 18.975, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.24821232123212322, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1.411560659110546, "learning_rate": 4.673266898662637e-06, "loss": 0.0001, "num_tokens": 5161888.0, "reward": 4.092499876022339, "reward_std": 0.015000002086162567, "rewards/coherence_reward_func/mean": 1.2924999475479126, "rewards/coherence_reward_func/std": 0.01499999761581421, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3610 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.2488998899889989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4795636057853698, "learning_rate": 4.670294784184436e-06, "loss": 0.0, "num_tokens": 5176032.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3620 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.2495874587458746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2247574172914029, "learning_rate": 4.6673101669625445e-06, "loss": 0.0, "num_tokens": 5190661.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3630 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.25027502750275027, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3278845094144345, "learning_rate": 4.664313064190893e-06, "loss": 0.0, "num_tokens": 5206219.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3640 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.25096259625962597, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3263515307568015, "learning_rate": 4.6613034931353445e-06, "loss": 0.0, "num_tokens": 5217886.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3650 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.25165016501650167, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2702551379799842, "learning_rate": 4.6582814711335874e-06, "loss": 0.0, "num_tokens": 5229738.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3660 }, { "completion_length": 24.7, "completions/clipped_ratio": 0.0, "completions/max_length": 24.7, "completions/max_terminated_length": 24.7, "completions/mean_length": 21.6, "completions/mean_terminated_length": 21.6, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2523377337733773, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0552857838571073, "learning_rate": 4.655247015595039e-06, "loss": 0.0, "num_tokens": 5244126.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3670 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 15.95, "completions/mean_terminated_length": 15.95, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.253025302530253, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 41.09145687818527, "learning_rate": 4.652200144000743e-06, "loss": 0.0017, "num_tokens": 5258988.0, "reward": 3.8949999094009398, "reward_std": 0.23671360015869142, "rewards/coherence_reward_func/mean": 1.23499995470047, "rewards/coherence_reward_func/std": 0.07505553364753723, "rewards/formatting_reward_func/mean": 1.9, "rewards/formatting_reward_func/std": 0.1154700517654419, "rewards/quality_reward_func/mean": 0.7600000113248825, "rewards/quality_reward_func/std": 0.046188023686408994, "step": 3680 }, { "completion_length": 21.3, "completions/clipped_ratio": 0.0, "completions/max_length": 21.3, "completions/max_terminated_length": 21.3, "completions/mean_length": 19.175, "completions/mean_terminated_length": 19.175, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.2537128712871287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4594970896840096, "learning_rate": 4.6491408739032705e-06, "loss": 0.0001, "num_tokens": 5273603.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3690 }, { "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.2544004400440044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1064071744680404, "learning_rate": 4.64606922292662e-06, "loss": 0.0, "num_tokens": 5288777.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3700 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.25508800880088006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3959959626197815, "learning_rate": 4.642985208766113e-06, "loss": 0.0, "num_tokens": 5300959.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3710 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.525, "completions/mean_terminated_length": 16.525, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.25577557755775576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3369979746639729, "learning_rate": 4.639888849188295e-06, "loss": 0.0, "num_tokens": 5314908.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3720 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.725, "completions/mean_terminated_length": 16.725, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.25646314631463146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4185206890106201, "learning_rate": 4.6367801620308295e-06, "loss": 0.0, "num_tokens": 5327609.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3730 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.675, "completions/mean_terminated_length": 17.675, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.25715071507150716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2852225728332995, "learning_rate": 4.633659165202398e-06, "loss": 0.0, "num_tokens": 5341592.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3740 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.25783828382838286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2239439487457275, "learning_rate": 4.630525876682597e-06, "loss": 0.0, "num_tokens": 5353784.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3750 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.825, "completions/mean_terminated_length": 16.825, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.2585258525852585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3233575984835624, "learning_rate": 4.627380314521833e-06, "loss": 0.0, "num_tokens": 5366529.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3760 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.2592134213421342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4207659110426902, "learning_rate": 4.624222496841219e-06, "loss": 0.0001, "num_tokens": 5380945.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3770 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.35, "completions/mean_terminated_length": 17.35, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2599009900990099, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4342102020978929, "learning_rate": 4.621052441832471e-06, "loss": 0.0001, "num_tokens": 5395375.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3780 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.2605885588558856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3788485825061798, "learning_rate": 4.617870167757801e-06, "loss": 0.0, "num_tokens": 5410043.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3790 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.825, "completions/mean_terminated_length": 17.825, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.26127612761276126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3054631665349006, "learning_rate": 4.614675692949815e-06, "loss": 0.0001, "num_tokens": 5423164.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3800 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.26196369636963696, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1.2541000019758939, "learning_rate": 4.611469035811404e-06, "loss": 0.0, "num_tokens": 5437159.0, "reward": 3.792499911785126, "reward_std": 0.20499999523162843, "rewards/coherence_reward_func/mean": 1.2024999558925629, "rewards/coherence_reward_func/std": 0.06499999761581421, "rewards/formatting_reward_func/mean": 1.85, "rewards/formatting_reward_func/std": 0.1, "rewards/quality_reward_func/mean": 0.7400000110268593, "rewards/quality_reward_func/std": 0.04000000059604645, "step": 3810 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.125, "completions/mean_terminated_length": 17.125, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.26265126512651266, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.126739951223135, "learning_rate": 4.60825021481564e-06, "loss": 0.0, "num_tokens": 5451712.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3820 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.26333883388338836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.021160862594843, "learning_rate": 4.60501924850567e-06, "loss": 0.0, "num_tokens": 5464550.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3830 }, { "completion_length": 15.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 15.15, "completions/mean_terminated_length": 15.15, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.264026402640264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.45318810492754, "learning_rate": 4.601776155494607e-06, "loss": 0.0, "num_tokens": 5477840.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3840 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.2647139713971397, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0578694000840188, "learning_rate": 4.5985209544654265e-06, "loss": 0.0, "num_tokens": 5491052.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3850 }, { "completion_length": 21.473684210526315, "completions/clipped_ratio": 0.0, "completions/max_length": 21.473684210526315, "completions/max_terminated_length": 21.473684210526315, "completions/mean_length": 19.07894736842105, "completions/mean_terminated_length": 19.07894736842105, "completions/min_length": 17.42105263157895, "completions/min_terminated_length": 17.42105263157895, "epoch": 0.2654015401540154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.208579867686096, "learning_rate": 4.595253664170852e-06, "loss": 0.0, "num_tokens": 5505996.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3860 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 19.475, "completions/mean_terminated_length": 19.475, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.2660891089108911, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2276733338832855, "learning_rate": 4.591974303433257e-06, "loss": 0.0, "num_tokens": 5521483.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3870 }, { "completion_length": 21.9, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 19.7, "completions/mean_terminated_length": 19.7, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.2667766776677668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2526239909231662, "learning_rate": 4.5886828911445475e-06, "loss": 0.0001, "num_tokens": 5536619.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3880 }, { "completion_length": 20.4, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.26746424642464245, "frac_reward_zero_std": 1.0, "grad_norm": 5.358908310881816e-05, "kl": 1.2319100320339202, "learning_rate": 4.585379446266057e-06, "loss": 0.0, "num_tokens": 5550945.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3890 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 18.3, "completions/mean_terminated_length": 18.3, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.26815181518151815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4535746276378632, "learning_rate": 4.582063987828438e-06, "loss": 0.0001, "num_tokens": 5563937.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3900 }, { "completion_length": 22.9, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.26883938393839385, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2058482803404331, "learning_rate": 4.5787365349315495e-06, "loss": 0.0001, "num_tokens": 5576467.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3910 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.05, "completions/mean_terminated_length": 17.05, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.26952695269526955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.382287858426571, "learning_rate": 4.575397106744351e-06, "loss": 0.0, "num_tokens": 5591485.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3920 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.425, "completions/mean_terminated_length": 17.425, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.2702145214521452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0665898650884629, "learning_rate": 4.5720457225047885e-06, "loss": 0.0, "num_tokens": 5605202.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3930 }, { "completion_length": 21.1, "completions/clipped_ratio": 0.0, "completions/max_length": 21.1, "completions/max_terminated_length": 21.1, "completions/mean_length": 18.375, "completions/mean_terminated_length": 18.375, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.2709020902090209, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.171196562051773, "learning_rate": 4.568682401519686e-06, "loss": 0.0, "num_tokens": 5621465.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3940 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 18.625, "completions/mean_terminated_length": 18.625, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.2715896589658966, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4796609468758106, "learning_rate": 4.5653071631646335e-06, "loss": 0.0001, "num_tokens": 5637482.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3950 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.2722772277227723, "frac_reward_zero_std": 1.0, "grad_norm": 8.943950524553657e-05, "kl": 1.0443273723125457, "learning_rate": 4.561920026883873e-06, "loss": 0.0, "num_tokens": 5651382.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3960 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.27296479647964794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3861748054623604, "learning_rate": 4.5585210121901894e-06, "loss": 0.0, "num_tokens": 5664223.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3970 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.675, "completions/mean_terminated_length": 17.675, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.27365236523652364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1854788262397051, "learning_rate": 4.5551101386648e-06, "loss": 0.0, "num_tokens": 5679882.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3980 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.27433993399339934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2322146125137805, "learning_rate": 4.551687425957235e-06, "loss": 0.0, "num_tokens": 5691761.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 3990 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.27502750275027504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.189116670936346, "learning_rate": 4.548252893785232e-06, "loss": 0.0, "num_tokens": 5705939.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4000 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2757150715071507, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1.4305979654192924, "learning_rate": 4.5448065619346175e-06, "loss": 0.0, "num_tokens": 5719755.0, "reward": 4.0099998950958256, "reward_std": 0.06000000238418579, "rewards/coherence_reward_func/mean": 1.24749995470047, "rewards/coherence_reward_func/std": 0.034999996423721313, "rewards/formatting_reward_func/mean": 1.9625, "rewards/formatting_reward_func/std": 0.025, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4010 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.825, "completions/mean_terminated_length": 16.825, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.2764026402640264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5111463770270348, "learning_rate": 4.541348450259193e-06, "loss": 0.0001, "num_tokens": 5734692.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4020 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 20.175, "completions/mean_terminated_length": 20.175, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.2770902090209021, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0053938809782266, "learning_rate": 4.5378785786806225e-06, "loss": 0.0, "num_tokens": 5747383.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4030 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.725, "completions/mean_terminated_length": 18.725, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.2777777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8882788009941578, "learning_rate": 4.534396967188318e-06, "loss": 0.0, "num_tokens": 5761364.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4040 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.2784653465346535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4726532608270646, "learning_rate": 4.530903635839323e-06, "loss": 0.0, "num_tokens": 5772653.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4050 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 18.05, "completions/mean_terminated_length": 18.05, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.27915291529152914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.943469512835145, "learning_rate": 4.527398604758195e-06, "loss": 0.0, "num_tokens": 5785743.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4060 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.27984048404840484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1530781347304582, "learning_rate": 4.523881894136896e-06, "loss": 0.0, "num_tokens": 5799751.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4070 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.7, "completions/mean_terminated_length": 17.7, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.28052805280528054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9560482200235129, "learning_rate": 4.52035352423467e-06, "loss": 0.0, "num_tokens": 5812563.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4080 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.625, "completions/mean_terminated_length": 15.625, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.28121562156215624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0881683185696602, "learning_rate": 4.516813515377927e-06, "loss": 0.0, "num_tokens": 5827188.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4090 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.925, "completions/mean_terminated_length": 17.925, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.2819031903190319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1691953319124877, "learning_rate": 4.513261887960129e-06, "loss": 0.0, "num_tokens": 5843893.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4100 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.2825907590759076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1508332364261151, "learning_rate": 4.50969866244167e-06, "loss": 0.0, "num_tokens": 5859147.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4110 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2832783278327833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.299021417624317, "learning_rate": 4.5061238593497604e-06, "loss": 0.0, "num_tokens": 5874075.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4120 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.283965896589659, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1608231782913208, "learning_rate": 4.502537499278306e-06, "loss": 0.0, "num_tokens": 5890154.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4130 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 18.975, "completions/mean_terminated_length": 18.975, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.28465346534653463, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0544112488627433, "learning_rate": 4.498939602887791e-06, "loss": 0.0, "num_tokens": 5905281.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4140 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.28534103410341033, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1131783194839955, "learning_rate": 4.495330190905158e-06, "loss": 0.0, "num_tokens": 5921091.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4150 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.6, "completions/mean_terminated_length": 16.6, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.28602860286028603, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3294072329998017, "learning_rate": 4.491709284123688e-06, "loss": 0.0, "num_tokens": 5936327.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4160 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.28671617161716173, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2027022048830986, "learning_rate": 4.488076903402886e-06, "loss": 0.0, "num_tokens": 5950739.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4170 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.275, "completions/mean_terminated_length": 16.275, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.28740374037403743, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3033845350146294, "learning_rate": 4.484433069668355e-06, "loss": 0.0, "num_tokens": 5966054.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4180 }, { "completion_length": 21.1, "completions/clipped_ratio": 0.0, "completions/max_length": 21.1, "completions/max_terminated_length": 21.1, "completions/mean_length": 18.075, "completions/mean_terminated_length": 18.075, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.2880913091309131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3826805047690869, "learning_rate": 4.480777803911672e-06, "loss": 0.0001, "num_tokens": 5981269.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4190 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.375, "completions/mean_terminated_length": 17.375, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.2887788778877888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2301786564290524, "learning_rate": 4.477111127190281e-06, "loss": 0.0, "num_tokens": 5995408.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4200 }, { "completion_length": 19.263157894736842, "completions/clipped_ratio": 0.0, "completions/max_length": 19.263157894736842, "completions/max_terminated_length": 19.263157894736842, "completions/mean_length": 16.842105263157894, "completions/mean_terminated_length": 16.842105263157894, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2894664466446645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.231401851302699, "learning_rate": 4.473433060627356e-06, "loss": 0.0, "num_tokens": 6010277.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4210 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.2901540154015402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3109652653336525, "learning_rate": 4.4697436254116876e-06, "loss": 0.0, "num_tokens": 6024628.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4220 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 15.65, "completions/mean_terminated_length": 15.65, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.2908415841584158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1730713717639447, "learning_rate": 4.4660428427975614e-06, "loss": 0.0, "num_tokens": 6039174.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4230 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 16.725, "completions/mean_terminated_length": 16.725, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.2915291529152915, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.115500158071518, "learning_rate": 4.462330734104633e-06, "loss": 0.0, "num_tokens": 6052447.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4240 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2922167216721672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2464444026350976, "learning_rate": 4.458607320717805e-06, "loss": 0.0, "num_tokens": 6066089.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4250 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.2929042904290429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0488379423040897, "learning_rate": 4.454872624087105e-06, "loss": 0.0, "num_tokens": 6079355.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4260 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.29359185918591857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.088923167437315, "learning_rate": 4.4511266657275624e-06, "loss": 0.0, "num_tokens": 6094181.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4270 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.29427942794279427, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1366772107779979, "learning_rate": 4.447369467219081e-06, "loss": 0.0, "num_tokens": 6107348.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4280 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.29496699669966997, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.187676628679037, "learning_rate": 4.443601050206322e-06, "loss": 0.0, "num_tokens": 6120793.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4290 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.1, "completions/mean_terminated_length": 15.1, "completions/min_length": 14.1, "completions/min_terminated_length": 14.1, "epoch": 0.29565456545654567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3925855614244937, "learning_rate": 4.439821436398573e-06, "loss": 0.0, "num_tokens": 6132273.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4300 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.2963421342134213, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3798075836151837, "learning_rate": 4.436030647569621e-06, "loss": 0.0, "num_tokens": 6147289.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4310 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.297029702970297, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1426802188158036, "learning_rate": 4.432228705557634e-06, "loss": 0.0, "num_tokens": 6161754.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4320 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.2977172717271727, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0807349354028701, "learning_rate": 4.428415632265033e-06, "loss": 0.0, "num_tokens": 6174450.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4330 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2984048404840484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0977762714028358, "learning_rate": 4.424591449658362e-06, "loss": 0.0, "num_tokens": 6188958.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4340 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.2990924092409241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0881227478384972, "learning_rate": 4.420756179768165e-06, "loss": 0.0, "num_tokens": 6204739.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4350 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 18.075, "completions/mean_terminated_length": 18.075, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.29977997799779976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1955605536699294, "learning_rate": 4.4169098446888594e-06, "loss": 0.0, "num_tokens": 6220802.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4360 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 16.85, "completions/mean_terminated_length": 16.85, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.30046754675467546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0562300879508257, "learning_rate": 4.413052466578605e-06, "loss": 0.0, "num_tokens": 6235288.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4370 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.30115511551155116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2873560920357705, "learning_rate": 4.409184067659181e-06, "loss": 0.0, "num_tokens": 6251829.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4380 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.30184268426842686, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.285911639779806, "learning_rate": 4.4053046702158555e-06, "loss": 0.0, "num_tokens": 6267491.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4390 }, { "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.075, "completions/mean_terminated_length": 20.075, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.3025302530253025, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.6993859726935625, "learning_rate": 4.401414296597256e-06, "loss": 0.0, "num_tokens": 6280378.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4400 }, { "completion_length": 22.1, "completions/clipped_ratio": 0.0, "completions/max_length": 22.1, "completions/max_terminated_length": 22.1, "completions/mean_length": 18.675, "completions/mean_terminated_length": 18.675, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.3032178217821782, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.113627065718174, "learning_rate": 4.397512969215243e-06, "loss": 0.0, "num_tokens": 6295541.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4410 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 15.95, "completions/mean_terminated_length": 15.95, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.3039053905390539, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4492276966571809, "learning_rate": 4.393600710544781e-06, "loss": 0.0, "num_tokens": 6311123.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4420 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 17.675, "completions/mean_terminated_length": 17.675, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.3045929592959296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1876488611102105, "learning_rate": 4.389677543123807e-06, "loss": 0.0, "num_tokens": 6325982.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4430 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.30528052805280526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5164424151182174, "learning_rate": 4.385743489553101e-06, "loss": 0.0001, "num_tokens": 6340319.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4440 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.3, "completions/mean_terminated_length": 18.3, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.30596809680968096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.173893976211548, "learning_rate": 4.3817985724961585e-06, "loss": 0.0, "num_tokens": 6356051.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4450 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.725, "completions/mean_terminated_length": 16.725, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.30665566556655666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.21855476051569, "learning_rate": 4.3778428146790565e-06, "loss": 0.0, "num_tokens": 6368848.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4460 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.30734323432343236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3003081649541854, "learning_rate": 4.373876238890322e-06, "loss": 0.0, "num_tokens": 6385515.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4470 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.30803080308030806, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3788954310119153, "learning_rate": 4.369898867980809e-06, "loss": 0.0, "num_tokens": 6400035.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4480 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3087183718371837, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0902920335531234, "learning_rate": 4.365910724863554e-06, "loss": 0.0, "num_tokens": 6415169.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4490 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.3094059405940594, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2483339451253415, "learning_rate": 4.361911832513652e-06, "loss": 0.0, "num_tokens": 6427085.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4500 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.3100935093509351, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2941459499299526, "learning_rate": 4.357902213968126e-06, "loss": 0.0, "num_tokens": 6442638.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4510 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.3107810781078108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2603263229131698, "learning_rate": 4.353881892325787e-06, "loss": 0.0, "num_tokens": 6456242.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4520 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.425, "completions/mean_terminated_length": 17.425, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.31146864686468645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9698817508295179, "learning_rate": 4.349850890747109e-06, "loss": 0.0, "num_tokens": 6470143.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4530 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 16.95, "completions/mean_terminated_length": 16.95, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.31215621562156215, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1165247913450003, "learning_rate": 4.345809232454088e-06, "loss": 0.0, "num_tokens": 6487277.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4540 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.31284378437843785, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2503239408135414, "learning_rate": 4.341756940730113e-06, "loss": 0.0, "num_tokens": 6501746.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4550 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.31353135313531355, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2415374740958214, "learning_rate": 4.33769403891983e-06, "loss": 0.0, "num_tokens": 6515103.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4560 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.3142189218921892, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1866839185357094, "learning_rate": 4.33362055042901e-06, "loss": 0.0, "num_tokens": 6527800.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4570 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.3149064906490649, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.348712769150734, "learning_rate": 4.32953649872441e-06, "loss": 0.0, "num_tokens": 6542468.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4580 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.3155940594059406, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2476600848138333, "learning_rate": 4.325441907333642e-06, "loss": 0.0, "num_tokens": 6556839.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4590 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.3162816281628163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2813825011253357, "learning_rate": 4.321336799845034e-06, "loss": 0.0, "num_tokens": 6570610.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4600 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.31696919691969194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3337368354201318, "learning_rate": 4.317221199907496e-06, "loss": 0.0, "num_tokens": 6583899.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4610 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.525, "completions/mean_terminated_length": 16.525, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.31765676567656764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.158595709502697, "learning_rate": 4.313095131230385e-06, "loss": 0.0, "num_tokens": 6598792.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4620 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.31834433443344334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4270890690386295, "learning_rate": 4.308958617583364e-06, "loss": 0.0001, "num_tokens": 6614748.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4630 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.31903190319031904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9190359987318516, "learning_rate": 4.304811682796271e-06, "loss": 0.0, "num_tokens": 6628996.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4640 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.31971947194719474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1849085062742233, "learning_rate": 4.300654350758977e-06, "loss": 0.0, "num_tokens": 6645647.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4650 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.3204070407040704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.494172091037035, "learning_rate": 4.296486645421249e-06, "loss": 0.0, "num_tokens": 6659029.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4660 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.3210946094609461, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8454300031065941, "learning_rate": 4.292308590792616e-06, "loss": 0.0, "num_tokens": 6675132.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4670 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.3217821782178218, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3840799629688263, "learning_rate": 4.288120210942223e-06, "loss": 0.0001, "num_tokens": 6690810.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4680 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 17.025, "completions/mean_terminated_length": 17.025, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.3224697469746975, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2411757558584213, "learning_rate": 4.283921529998702e-06, "loss": 0.0, "num_tokens": 6708547.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4690 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.32315731573157314, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014543857832904905, "kl": 1.0443045005202294, "learning_rate": 4.2797125721500275e-06, "loss": 0.0, "num_tokens": 6722499.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4700 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.1, "completions/mean_terminated_length": 17.1, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.32384488448844884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3368266090750693, "learning_rate": 4.275493361643374e-06, "loss": 0.0, "num_tokens": 6737003.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4710 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.375, "completions/mean_terminated_length": 17.375, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.32453245324532454, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0547638040734455, "learning_rate": 4.271263922784981e-06, "loss": 0.0, "num_tokens": 6750866.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4720 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.32522002200220024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3946014061570167, "learning_rate": 4.267024279940017e-06, "loss": 0.0001, "num_tokens": 6765004.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4730 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.075, "completions/mean_terminated_length": 18.075, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.3259075907590759, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2133473329246045, "learning_rate": 4.262774457532428e-06, "loss": 0.0, "num_tokens": 6780903.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4740 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.3265951595159516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9682459566742182, "learning_rate": 4.2585144800448055e-06, "loss": 0.0, "num_tokens": 6794935.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4750 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 16.65, "completions/mean_terminated_length": 16.65, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.3272827282728273, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4514311589300632, "learning_rate": 4.254244372018244e-06, "loss": 0.0001, "num_tokens": 6808745.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4760 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.327970297029703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.065644410997629, "learning_rate": 4.249964158052195e-06, "loss": 0.0, "num_tokens": 6822524.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4770 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.3286578657865787, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3550350315868855, "learning_rate": 4.2456738628043324e-06, "loss": 0.0, "num_tokens": 6838473.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4780 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.7, "completions/mean_terminated_length": 17.7, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.32934543454345433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3020609110593795, "learning_rate": 4.241373510990406e-06, "loss": 0.0, "num_tokens": 6852517.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4790 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.33003300330033003, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2051956176757812, "learning_rate": 4.237063127384099e-06, "loss": 0.0, "num_tokens": 6866874.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4800 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 18.325, "completions/mean_terminated_length": 18.325, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.33072057205720573, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2347914427518845, "learning_rate": 4.232742736816887e-06, "loss": 0.0, "num_tokens": 6883619.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4810 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.33140814081408143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8823791073635221, "learning_rate": 4.228412364177893e-06, "loss": 0.0, "num_tokens": 6897733.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4820 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.65, "completions/mean_terminated_length": 17.65, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.3320957095709571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1724223725497722, "learning_rate": 4.2240720344137476e-06, "loss": 0.0, "num_tokens": 6911055.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4830 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.3327832783278328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8990753037855029, "learning_rate": 4.21972177252844e-06, "loss": 0.0, "num_tokens": 6923818.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4840 }, { "completion_length": 21.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 19.175, "completions/mean_terminated_length": 19.175, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.3334708470847085, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9704873599112034, "learning_rate": 4.2153616035831806e-06, "loss": 0.0, "num_tokens": 6940993.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4850 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.375, "completions/mean_terminated_length": 17.375, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.3341584158415842, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0528290897607804, "learning_rate": 4.210991552696247e-06, "loss": 0.0, "num_tokens": 6957752.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4860 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.3348459845984598, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9078183400444686, "learning_rate": 4.2066116450428525e-06, "loss": 0.0, "num_tokens": 6973089.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4870 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.3355335533553355, "frac_reward_zero_std": 1.0, "grad_norm": 2.7470434361021034e-05, "kl": 1.577232411503792, "learning_rate": 4.202221905854989e-06, "loss": 0.0001, "num_tokens": 6989239.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4880 }, { "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.625, "completions/mean_terminated_length": 15.625, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.3362211221122112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1079448973294348, "learning_rate": 4.197822360421286e-06, "loss": 0.0, "num_tokens": 7003600.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4890 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.3369086908690869, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4668400838971138, "learning_rate": 4.193413034086868e-06, "loss": 0.0001, "num_tokens": 7018585.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4900 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.33759625962596257, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.306707089813426, "learning_rate": 4.188993952253205e-06, "loss": 0.0, "num_tokens": 7033004.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4910 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.975, "completions/mean_terminated_length": 17.975, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.33828382838283827, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9754183698445559, "learning_rate": 4.1845651403779655e-06, "loss": 0.0, "num_tokens": 7046931.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4920 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.925, "completions/mean_terminated_length": 17.925, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.33897139713971397, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.38096314817667, "learning_rate": 4.180126623974874e-06, "loss": 0.0001, "num_tokens": 7061420.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4930 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.33965896589658967, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9460880151760648, "learning_rate": 4.175678428613557e-06, "loss": 0.0, "num_tokens": 7076598.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4940 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.34034653465346537, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1463233292102815, "learning_rate": 4.171220579919406e-06, "loss": 0.0, "num_tokens": 7091077.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4950 }, { "completion_length": 16.1, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.341034103410341, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9677249977365137, "learning_rate": 4.16675310357342e-06, "loss": 0.0, "num_tokens": 7101725.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4960 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.3417216721672167, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3026058718562126, "learning_rate": 4.162276025312059e-06, "loss": 0.0, "num_tokens": 7117885.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4970 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 19.9, "completions/mean_terminated_length": 19.9, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.3424092409240924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2132138408720494, "learning_rate": 4.157789370927104e-06, "loss": 0.0, "num_tokens": 7132993.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4980 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.3430968096809681, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3136692702770234, "learning_rate": 4.153293166265502e-06, "loss": 0.0, "num_tokens": 7148478.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 4990 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.34378437843784376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5792409382760524, "learning_rate": 4.1487874372292106e-06, "loss": 0.0001, "num_tokens": 7163946.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5000 }, { "completion_length": 21.3, "completions/clipped_ratio": 0.0, "completions/max_length": 21.3, "completions/max_terminated_length": 21.3, "completions/mean_length": 19.075, "completions/mean_terminated_length": 19.075, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.34447194719471946, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0890948809683323, "learning_rate": 4.1442722097750645e-06, "loss": 0.0, "num_tokens": 7177981.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5010 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 14.1, "completions/min_terminated_length": 14.1, "epoch": 0.34515951595159516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2117942936718464, "learning_rate": 4.139747509914613e-06, "loss": 0.0, "num_tokens": 7191344.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5020 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.34584708470847086, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3905832149088382, "learning_rate": 4.135213363713976e-06, "loss": 0.0, "num_tokens": 7207557.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5030 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.3465346534653465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4240806803107262, "learning_rate": 4.13066979729369e-06, "loss": 0.0001, "num_tokens": 7221180.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5040 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.125, "completions/mean_terminated_length": 17.125, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.3472222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0334409718617508, "learning_rate": 4.126116836828563e-06, "loss": 0.0, "num_tokens": 7235709.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5050 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.3479097909790979, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3188940420746804, "learning_rate": 4.121554508547518e-06, "loss": 0.0, "num_tokens": 7248869.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5060 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.3485973597359736, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1.4126264125108718, "learning_rate": 4.116982838733449e-06, "loss": 0.0001, "num_tokens": 7260688.0, "reward": 4.074999904632568, "reward_std": 0.028867512941360474, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.7750000119209289, "rewards/quality_reward_func/std": 0.028867512941360474, "step": 5070 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.65, "completions/mean_terminated_length": 18.65, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.3492849284928493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5038474194705487, "learning_rate": 4.112401853723058e-06, "loss": 0.0001, "num_tokens": 7274190.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5080 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.325, "completions/mean_terminated_length": 16.325, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.34997249724972496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1819006368517875, "learning_rate": 4.107811579906718e-06, "loss": 0.0, "num_tokens": 7289051.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5090 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.375, "completions/mean_terminated_length": 16.375, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.35066006600660066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1194199629127979, "learning_rate": 4.103212043728308e-06, "loss": 0.0, "num_tokens": 7305174.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5100 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.35134763476347636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9471234813332557, "learning_rate": 4.09860327168507e-06, "loss": 0.0, "num_tokens": 7317111.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5110 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.45, "completions/mean_terminated_length": 17.45, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.35203520352035206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.301464532315731, "learning_rate": 4.093985290327448e-06, "loss": 0.0, "num_tokens": 7332653.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5120 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.325, "completions/mean_terminated_length": 16.325, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.3527227722772277, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5016500294208526, "learning_rate": 4.089358126258943e-06, "loss": 0.0001, "num_tokens": 7347698.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5130 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.3534103410341034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2618799805641174, "learning_rate": 4.084721806135956e-06, "loss": 0.0, "num_tokens": 7362377.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5140 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.3540979097909791, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2617546994239093, "learning_rate": 4.080076356667633e-06, "loss": 0.0, "num_tokens": 7376940.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5150 }, { "completion_length": 16.57894736842105, "completions/clipped_ratio": 0.0, "completions/max_length": 16.57894736842105, "completions/max_terminated_length": 16.57894736842105, "completions/mean_length": 15.605263157894736, "completions/mean_terminated_length": 15.605263157894736, "completions/min_length": 15.105263157894736, "completions/min_terminated_length": 15.105263157894736, "epoch": 0.3547854785478548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2172791863742627, "learning_rate": 4.075421804615715e-06, "loss": 0.0, "num_tokens": 7391547.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5160 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.15, "completions/mean_terminated_length": 17.15, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.35547304730473045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9243504330515862, "learning_rate": 4.070758176794378e-06, "loss": 0.0, "num_tokens": 7406345.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5170 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.55, "completions/mean_terminated_length": 18.55, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.35616061606160615, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3112298667430877, "learning_rate": 4.066085500070087e-06, "loss": 0.0001, "num_tokens": 7420439.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5180 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.35684818481848185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.347011312842369, "learning_rate": 4.061403801361432e-06, "loss": 0.0, "num_tokens": 7435522.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5190 }, { "completion_length": 20.8, "completions/clipped_ratio": 0.0, "completions/max_length": 20.8, "completions/max_terminated_length": 20.8, "completions/mean_length": 18.325, "completions/mean_terminated_length": 18.325, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.35753575357535755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2339761398732663, "learning_rate": 4.0567131076389795e-06, "loss": 0.0, "num_tokens": 7451491.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5200 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.1, "completions/mean_terminated_length": 16.1, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.3582233223322332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1732663363218307, "learning_rate": 4.052013445925116e-06, "loss": 0.0, "num_tokens": 7466091.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5210 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.125, "completions/mean_terminated_length": 17.125, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.3589108910891089, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2563588432967663, "learning_rate": 4.0473048432938875e-06, "loss": 0.0, "num_tokens": 7481236.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5220 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.2, "completions/mean_terminated_length": 18.2, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.3595984598459846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.28299700319767, "learning_rate": 4.042587326870851e-06, "loss": 0.0001, "num_tokens": 7495776.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5230 }, { "completion_length": 22.2, "completions/clipped_ratio": 0.0, "completions/max_length": 22.2, "completions/max_terminated_length": 22.2, "completions/mean_length": 20.35, "completions/mean_terminated_length": 20.35, "completions/min_length": 18.9, "completions/min_terminated_length": 18.9, "epoch": 0.3602860286028603, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4640183687210082, "learning_rate": 4.037860923832913e-06, "loss": 0.0001, "num_tokens": 7511746.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5240 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.360973597359736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.186241403222084, "learning_rate": 4.0331256614081735e-06, "loss": 0.0, "num_tokens": 7525094.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5250 }, { "completion_length": 18.08955223880597, "completions/clipped_ratio": 0.0, "completions/max_length": 18.11764705882353, "completions/max_terminated_length": 18.11764705882353, "completions/mean_length": 16.794117647058822, "completions/mean_terminated_length": 16.794117647058822, "completions/min_length": 15.411764705882353, "completions/min_terminated_length": 15.411764705882353, "epoch": 0.36166116611661164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0812231904979963, "learning_rate": 4.028381566875773e-06, "loss": 0.0, "num_tokens": 7540044.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5260 }, { "completion_length": 20.15, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 18.375, "completions/mean_terminated_length": 18.375, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.36234873487348734, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9661604385823012, "learning_rate": 4.02362866756573e-06, "loss": 0.0, "num_tokens": 7554587.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5270 }, { "completion_length": 16.775, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.36303630363036304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3903781726956368, "learning_rate": 4.018866990858785e-06, "loss": 0.0, "num_tokens": 7569714.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5280 }, { "completion_length": 19.675, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 18.15, "completions/mean_terminated_length": 18.15, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.36372387238723874, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3349122866988181, "learning_rate": 4.014096564186248e-06, "loss": 0.0, "num_tokens": 7582848.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5290 }, { "completion_length": 19.425, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.3644114411441144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8336154259741306, "learning_rate": 4.009317415029832e-06, "loss": 0.0, "num_tokens": 7597619.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5300 }, { "completion_length": 18.65, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.85, "completions/mean_terminated_length": 17.85, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.3650990099009901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0214567624032498, "learning_rate": 4.004529570921501e-06, "loss": 0.0, "num_tokens": 7612549.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5310 }, { "completion_length": 17.975, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 17.15, "completions/mean_terminated_length": 17.15, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.3657865786578658, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2231212853454054, "learning_rate": 3.99973305944331e-06, "loss": 0.0, "num_tokens": 7627539.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5320 }, { "completion_length": 17.775, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 17.175, "completions/mean_terminated_length": 17.175, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.3664741474147415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.197108805179596, "learning_rate": 3.9949279082272425e-06, "loss": 0.0, "num_tokens": 7643738.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5330 }, { "completion_length": 17.375, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 15.925, "completions/mean_terminated_length": 15.925, "completions/min_length": 13.9, "completions/min_terminated_length": 13.9, "epoch": 0.36716171617161714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1159055039286614, "learning_rate": 3.9901141449550565e-06, "loss": 0.0, "num_tokens": 7658551.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5340 }, { "completion_length": 17.075, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.36784928492849284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0155922904610635, "learning_rate": 3.985291797358123e-06, "loss": 0.0, "num_tokens": 7671674.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5350 }, { "completion_length": 16.825, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.36853685368536854, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2678054243326187, "learning_rate": 3.980460893217265e-06, "loss": 0.0, "num_tokens": 7684565.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5360 }, { "completion_length": 17.975, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.36922442244224424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2318198367953301, "learning_rate": 3.9756214603626e-06, "loss": 0.0, "num_tokens": 7698909.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5370 }, { "completion_length": 18.95, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.36991199119911994, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3333981722593307, "learning_rate": 3.9707735266733735e-06, "loss": 0.0, "num_tokens": 7715133.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5380 }, { "completion_length": 17.425, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3705995599559956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3391637369990348, "learning_rate": 3.965917120077811e-06, "loss": 0.0, "num_tokens": 7727317.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5390 }, { "completion_length": 19.15, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.3712871287128713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1109920389950276, "learning_rate": 3.961052268552941e-06, "loss": 0.0, "num_tokens": 7743642.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5400 }, { "completion_length": 16.95, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 15.875, "completions/mean_terminated_length": 15.875, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.371974697469747, "frac_reward_zero_std": 1.0, "grad_norm": 5.6203894928330556e-05, "kl": 1.363871442526579, "learning_rate": 3.956179000124447e-06, "loss": 0.0, "num_tokens": 7758365.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5410 }, { "completion_length": 17.825, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.15, "completions/mean_terminated_length": 16.15, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.3726622662266227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9931762866675854, "learning_rate": 3.9512973428665e-06, "loss": 0.0, "num_tokens": 7772323.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5420 }, { "completion_length": 21.425, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 18.825, "completions/mean_terminated_length": 18.825, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.37334983498349833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0012955855578185, "learning_rate": 3.946407324901598e-06, "loss": 0.0, "num_tokens": 7785692.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5430 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.37403740374037403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0526311319321393, "learning_rate": 3.941508974400401e-06, "loss": 0.0, "num_tokens": 7802662.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5440 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.37472497249724973, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1798742283135653, "learning_rate": 3.9366023195815755e-06, "loss": 0.0, "num_tokens": 7817133.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5450 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 18.225, "completions/mean_terminated_length": 18.225, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.37541254125412543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3727002948522569, "learning_rate": 3.931687388711626e-06, "loss": 0.0001, "num_tokens": 7833654.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5460 }, { "completion_length": 20.15, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.3761001100110011, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0240365587174893, "learning_rate": 3.926764210104733e-06, "loss": 0.0, "num_tokens": 7851086.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5470 }, { "completion_length": 19.075, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.3767876787678768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1240653157234193, "learning_rate": 3.921832812122593e-06, "loss": 0.0, "num_tokens": 7867270.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5480 }, { "completion_length": 17.975, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.3774752475247525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4767700091004372, "learning_rate": 3.916893223174254e-06, "loss": 0.0001, "num_tokens": 7882340.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5490 }, { "completion_length": 18.725, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 16.35, "completions/mean_terminated_length": 16.35, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.3781628162816282, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.476631324738264, "learning_rate": 3.911945471715947e-06, "loss": 0.0001, "num_tokens": 7897518.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5500 }, { "completion_length": 19.825, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 18.3, "completions/mean_terminated_length": 18.3, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.3788503850385038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1571273379027844, "learning_rate": 3.906989586250928e-06, "loss": 0.0, "num_tokens": 7911386.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5510 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 15.575, "completions/mean_terminated_length": 15.575, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.3795379537953795, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3854421511292458, "learning_rate": 3.902025595329314e-06, "loss": 0.0, "num_tokens": 7923165.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5520 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.3802255225522552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2270353332161903, "learning_rate": 3.897053527547912e-06, "loss": 0.0, "num_tokens": 7937471.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5530 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.3809130913091309, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0666535507887602, "learning_rate": 3.892073411550062e-06, "loss": 0.0, "num_tokens": 7951813.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5540 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.3816006600660066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2041775345802308, "learning_rate": 3.887085276025469e-06, "loss": 0.0, "num_tokens": 7968181.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5550 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.38228822882288227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1235090486705304, "learning_rate": 3.882089149710035e-06, "loss": 0.0, "num_tokens": 7984055.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5560 }, { "completion_length": 18.725, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.38297579757975797, "frac_reward_zero_std": 1.0, "grad_norm": 2.725888043642044e-05, "kl": 1.1920234143733979, "learning_rate": 3.877085061385694e-06, "loss": 0.0, "num_tokens": 7997675.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5570 }, { "completion_length": 18.275, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.6, "completions/mean_terminated_length": 16.6, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.38366336633663367, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9494880434125662, "learning_rate": 3.872073039880254e-06, "loss": 0.0, "num_tokens": 8011851.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5580 }, { "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.2, "completions/max_terminated_length": 23.2, "completions/mean_length": 19.575, "completions/mean_terminated_length": 19.575, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.38435093509350937, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0359878040850163, "learning_rate": 3.8670531140672194e-06, "loss": 0.0, "num_tokens": 8024570.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5590 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.385038503850385, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4615533858537675, "learning_rate": 3.862025312865633e-06, "loss": 0.0001, "num_tokens": 8039680.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5600 }, { "completion_length": 17.725, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.6, "completions/mean_terminated_length": 16.6, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3857260726072607, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1007069438695907, "learning_rate": 3.856989665239904e-06, "loss": 0.0, "num_tokens": 8054900.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5610 }, { "completion_length": 19.275, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 18.075, "completions/mean_terminated_length": 18.075, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.3864136413641364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1791205305606127, "learning_rate": 3.851946200199648e-06, "loss": 0.0, "num_tokens": 8070555.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5620 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.3871012101210121, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1043142512440682, "learning_rate": 3.846894946799511e-06, "loss": 0.0, "num_tokens": 8083116.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5630 }, { "completion_length": 17.65, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.025, "completions/mean_terminated_length": 16.025, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.38778877887788776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4466410249471664, "learning_rate": 3.841835934139008e-06, "loss": 0.0, "num_tokens": 8097373.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5640 }, { "completion_length": 17.175, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 15.975, "completions/mean_terminated_length": 15.975, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.38847634763476346, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3932079687714576, "learning_rate": 3.8367691913623565e-06, "loss": 0.0, "num_tokens": 8108796.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5650 }, { "completion_length": 20.275, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 17.525, "completions/mean_terminated_length": 17.525, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.38916391639163916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1165172673761845, "learning_rate": 3.831694747658301e-06, "loss": 0.0, "num_tokens": 8123245.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5660 }, { "completion_length": 17.175, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.2, "completions/mean_terminated_length": 16.2, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.38985148514851486, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.037246273458004, "learning_rate": 3.826612632259955e-06, "loss": 0.0, "num_tokens": 8137105.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5670 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.65, "completions/mean_terminated_length": 17.65, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.39053905390539057, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.185601119697094, "learning_rate": 3.821522874444626e-06, "loss": 0.0, "num_tokens": 8151835.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5680 }, { "completion_length": 20.15, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 18.2, "completions/mean_terminated_length": 18.2, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.3912266226622662, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9930311039090156, "learning_rate": 3.8164255035336454e-06, "loss": 0.0, "num_tokens": 8165839.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5690 }, { "completion_length": 18.85, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.3919141914191419, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4449263490736484, "learning_rate": 3.811320548892205e-06, "loss": 0.0001, "num_tokens": 8177630.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5700 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3926017601760176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1857761025428772, "learning_rate": 3.8062080399291872e-06, "loss": 0.0, "num_tokens": 8192361.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5710 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 18.975, "completions/mean_terminated_length": 18.975, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.3932893289328933, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.346421904861927, "learning_rate": 3.801088006096989e-06, "loss": 0.0001, "num_tokens": 8204804.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5720 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.875, "completions/mean_terminated_length": 15.875, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.39397689768976896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1227647330611945, "learning_rate": 3.7959604768913615e-06, "loss": 0.0, "num_tokens": 8220067.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5730 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 17.1, "completions/mean_terminated_length": 17.1, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.39466446644664466, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3379293769598006, "learning_rate": 3.7908254818512323e-06, "loss": 0.0, "num_tokens": 8235871.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5740 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.39535203520352036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1466250203549861, "learning_rate": 3.785683050558541e-06, "loss": 0.0, "num_tokens": 8249645.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5750 }, { "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 15.05, "completions/mean_terminated_length": 15.05, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.39603960396039606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3106171108782292, "learning_rate": 3.7805332126380647e-06, "loss": 0.0, "num_tokens": 8262587.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5760 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 17.925, "completions/mean_terminated_length": 17.925, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.3967271727172717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1861035495996475, "learning_rate": 3.775375997757249e-06, "loss": 0.0, "num_tokens": 8276160.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5770 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.3974147414741474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0805307626724243, "learning_rate": 3.7702114356260387e-06, "loss": 0.0, "num_tokens": 8290663.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5780 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.3981023102310231, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2187039345502853, "learning_rate": 3.7650395559967036e-06, "loss": 0.0, "num_tokens": 8301238.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5790 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 17.05, "completions/mean_terminated_length": 17.05, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3987898789878988, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1940217852592467, "learning_rate": 3.759860388663668e-06, "loss": 0.0, "num_tokens": 8313336.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5800 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 17.025, "completions/mean_terminated_length": 17.025, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.39947744774477445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0611320044845343, "learning_rate": 3.754673963463341e-06, "loss": 0.0, "num_tokens": 8327733.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5810 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.125, "completions/mean_terminated_length": 17.125, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.40016501650165015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1921575225889682, "learning_rate": 3.749480310273943e-06, "loss": 0.0, "num_tokens": 8341750.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5820 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 16.35, "completions/mean_terminated_length": 16.35, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.40085258525852585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9138251326978206, "learning_rate": 3.7442794590153326e-06, "loss": 0.0, "num_tokens": 8356848.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5830 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.2, "completions/mean_terminated_length": 18.2, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.40154015401540155, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1536221474409103, "learning_rate": 3.739071439648836e-06, "loss": 0.0, "num_tokens": 8372328.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5840 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 16.325, "completions/mean_terminated_length": 16.325, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.40222772277227725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0150370292365551, "learning_rate": 3.733856282177074e-06, "loss": 0.0, "num_tokens": 8387829.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5850 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.4029152915291529, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3025204107165336, "learning_rate": 3.7286340166437907e-06, "loss": 0.0, "num_tokens": 8402069.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5860 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 17.9, "completions/mean_terminated_length": 17.9, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.4036028602860286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1302866250276566, "learning_rate": 3.723404673133674e-06, "loss": 0.0, "num_tokens": 8416929.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5870 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.15, "completions/mean_terminated_length": 17.15, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.4042904290429043, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0556719139218331, "learning_rate": 3.7181682817721915e-06, "loss": 0.0, "num_tokens": 8433219.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5880 }, { "completion_length": 20.4, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.40497799779978, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1209779269993305, "learning_rate": 3.712924872725411e-06, "loss": 0.0, "num_tokens": 8448301.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5890 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.45, "completions/mean_terminated_length": 17.45, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.40566556655665564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1055759094655513, "learning_rate": 3.7076744761998268e-06, "loss": 0.0, "num_tokens": 8461651.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5900 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.40635313531353134, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0736303746700286, "learning_rate": 3.7024171224421884e-06, "loss": 0.0, "num_tokens": 8475424.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5910 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.40704070407040704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2711664289236069, "learning_rate": 3.6971528417393254e-06, "loss": 0.0, "num_tokens": 8490933.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5920 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.575, "completions/mean_terminated_length": 18.575, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.40772827282728275, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9545292537659407, "learning_rate": 3.6918816644179707e-06, "loss": 0.0, "num_tokens": 8504496.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5930 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.4084158415841584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1604718565940857, "learning_rate": 3.686603620844589e-06, "loss": 0.0, "num_tokens": 8517765.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5940 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.4091034103410341, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.387231619283557, "learning_rate": 3.6813187414252e-06, "loss": 0.0, "num_tokens": 8530935.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5950 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.4097909790979098, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3967902317643166, "learning_rate": 3.6760270566052037e-06, "loss": 0.0, "num_tokens": 8544803.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5960 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.275, "completions/mean_terminated_length": 16.275, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.4104785478547855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1969308275729418, "learning_rate": 3.670728596869205e-06, "loss": 0.0, "num_tokens": 8558642.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5970 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 15.625, "completions/mean_terminated_length": 15.625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.4111661166116612, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0570856800302864, "learning_rate": 3.6654233927408377e-06, "loss": 0.0, "num_tokens": 8572351.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5980 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.35, "completions/mean_terminated_length": 17.35, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.41185368536853684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3443511426448822, "learning_rate": 3.66011147478259e-06, "loss": 0.0, "num_tokens": 8588401.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 5990 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.41254125412541254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.286434081196785, "learning_rate": 3.654792873595627e-06, "loss": 0.0, "num_tokens": 8604144.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6000 }, { "completion_length": 20.267857142857142, "completions/clipped_ratio": 0.0, "completions/max_length": 20.266666666666666, "completions/max_terminated_length": 20.266666666666666, "completions/mean_length": 17.966666666666665, "completions/mean_terminated_length": 17.966666666666665, "completions/min_length": 16.266666666666666, "completions/min_terminated_length": 16.266666666666666, "epoch": 0.41322882288228824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9310508279928139, "learning_rate": 3.649467619819613e-06, "loss": 0.0, "num_tokens": 8616868.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6010 }, { "completion_length": 18.375, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.525, "completions/mean_terminated_length": 17.525, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.41391639163916394, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3302444115281105, "learning_rate": 3.6441357441325416e-06, "loss": 0.0, "num_tokens": 8630089.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6020 }, { "completion_length": 19.05, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.4146039603960396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.272117492184043, "learning_rate": 3.6387972772505493e-06, "loss": 0.0, "num_tokens": 8643569.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6030 }, { "completion_length": 20.55, "completions/clipped_ratio": 0.0, "completions/max_length": 21.3, "completions/max_terminated_length": 21.3, "completions/mean_length": 18.6, "completions/mean_terminated_length": 18.6, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.4152915291529153, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9967441722750664, "learning_rate": 3.6334522499277454e-06, "loss": 0.0, "num_tokens": 8659189.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6040 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.7, "completions/mean_terminated_length": 17.7, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.415979097909791, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.178647445142269, "learning_rate": 3.628100692956034e-06, "loss": 0.0, "num_tokens": 8674121.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6050 }, { "completion_length": 18.675, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.15, "completions/mean_terminated_length": 17.15, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.4166666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1533052645623685, "learning_rate": 3.6227426371649334e-06, "loss": 0.0, "num_tokens": 8686443.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6060 }, { "completion_length": 19.375, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 18.1, "completions/mean_terminated_length": 18.1, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.41735423542354233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1977208808064461, "learning_rate": 3.617378113421402e-06, "loss": 0.0, "num_tokens": 8700651.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6070 }, { "completion_length": 21.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 18.1, "completions/mean_terminated_length": 18.1, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.41804180418041803, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0092668190598488, "learning_rate": 3.6120071526296597e-06, "loss": 0.0, "num_tokens": 8714863.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6080 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.825, "completions/mean_terminated_length": 18.825, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.41872937293729373, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9233269207179546, "learning_rate": 3.6066297857310075e-06, "loss": 0.0, "num_tokens": 8728856.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6090 }, { "completion_length": 19.15, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.41941694169416943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1651827566325665, "learning_rate": 3.6012460437036525e-06, "loss": 0.0, "num_tokens": 8743392.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6100 }, { "completion_length": 16.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.625, "completions/mean_terminated_length": 15.625, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.42010451045104513, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2832200959324838, "learning_rate": 3.595855957562527e-06, "loss": 0.0, "num_tokens": 8757293.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6110 }, { "completion_length": 18.475, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 17.375, "completions/mean_terminated_length": 17.375, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.4207920792079208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2778358906507492, "learning_rate": 3.5904595583591113e-06, "loss": 0.0, "num_tokens": 8772660.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6120 }, { "completion_length": 18.025, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.4214796479647965, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0669266467913985, "learning_rate": 3.5850568771812544e-06, "loss": 0.0, "num_tokens": 8788930.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6130 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 15.9, "completions/mean_terminated_length": 15.9, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.4221672167216722, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4181117609143257, "learning_rate": 3.579647945152994e-06, "loss": 0.0, "num_tokens": 8802998.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6140 }, { "completion_length": 19.95, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.975, "completions/mean_terminated_length": 17.975, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.4228547854785479, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.283132864534855, "learning_rate": 3.574232793434379e-06, "loss": 0.0, "num_tokens": 8819749.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6150 }, { "completion_length": 18.625, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.4235423542354235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.121612760424614, "learning_rate": 3.56881145322129e-06, "loss": 0.0, "num_tokens": 8832619.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6160 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.825, "completions/mean_terminated_length": 16.825, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.4242299229922992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1134303838014603, "learning_rate": 3.5633839557452543e-06, "loss": 0.0, "num_tokens": 8847872.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6170 }, { "completion_length": 19.425, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 18.55, "completions/mean_terminated_length": 18.55, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.4249174917491749, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1435566797852517, "learning_rate": 3.557950332273276e-06, "loss": 0.0, "num_tokens": 8862446.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6180 }, { "completion_length": 17.65, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.4256050605060506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3846249349415303, "learning_rate": 3.552510614107646e-06, "loss": 0.0, "num_tokens": 8876650.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6190 }, { "completion_length": 20.075, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.2, "completions/mean_terminated_length": 18.2, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.42629262926292627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2280396707355976, "learning_rate": 3.5470648325857667e-06, "loss": 0.0, "num_tokens": 8890522.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6200 }, { "completion_length": 21.525, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.8, "completions/mean_terminated_length": 18.8, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.42698019801980197, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0913087368011474, "learning_rate": 3.5416130190799726e-06, "loss": 0.0, "num_tokens": 8903314.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6210 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.025, "completions/mean_terminated_length": 17.025, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.42766776677667767, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0859883170574904, "learning_rate": 3.5361552049973443e-06, "loss": 0.0, "num_tokens": 8916443.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6220 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.6, "completions/mean_terminated_length": 16.6, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.42835533553355337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3771632492542267, "learning_rate": 3.530691421779533e-06, "loss": 0.0, "num_tokens": 8931255.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6230 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.2, "completions/mean_terminated_length": 16.2, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.429042904290429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.033303615450859, "learning_rate": 3.5252217009025785e-06, "loss": 0.0, "num_tokens": 8945919.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6240 }, { "completion_length": 19.125, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 16.375, "completions/mean_terminated_length": 16.375, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.4297304730473047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9652321547269821, "learning_rate": 3.519746073876722e-06, "loss": 0.0, "num_tokens": 8960426.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6250 }, { "completion_length": 19.35, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.4304180418041804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1177176356315612, "learning_rate": 3.5142645722462344e-06, "loss": 0.0, "num_tokens": 8976251.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6260 }, { "completion_length": 18.275, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.45, "completions/mean_terminated_length": 16.45, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.4311056105610561, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0503662951290607, "learning_rate": 3.508777227589225e-06, "loss": 0.0, "num_tokens": 8991429.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6270 }, { "completion_length": 18.55, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 17.45, "completions/mean_terminated_length": 17.45, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.4317931793179318, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2404997482895852, "learning_rate": 3.5032840715174667e-06, "loss": 0.0, "num_tokens": 9005219.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6280 }, { "completion_length": 17.825, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.9, "completions/mean_terminated_length": 17.9, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.43248074807480746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2390207281336187, "learning_rate": 3.49778513567621e-06, "loss": 0.0, "num_tokens": 9021007.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6290 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.43316831683168316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1214835539460182, "learning_rate": 3.492280451744002e-06, "loss": 0.0, "num_tokens": 9037529.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6300 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.95, "completions/mean_terminated_length": 16.95, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.43385588558855886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2757804594933986, "learning_rate": 3.486770051432503e-06, "loss": 0.0, "num_tokens": 9053607.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6310 }, { "completion_length": 21.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 18.275, "completions/mean_terminated_length": 18.275, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.43454345434543457, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1102384865283965, "learning_rate": 3.4812539664863054e-06, "loss": 0.0, "num_tokens": 9068402.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6320 }, { "completion_length": 15.925, "completions/clipped_ratio": 0.0, "completions/max_length": 15.4, "completions/max_terminated_length": 15.4, "completions/mean_length": 15.15, "completions/mean_terminated_length": 15.15, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.4352310231023102, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.142233519256115, "learning_rate": 3.475732228682751e-06, "loss": 0.0, "num_tokens": 9083336.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6330 }, { "completion_length": 16.3, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 15.7, "completions/mean_terminated_length": 15.7, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.4359185918591859, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.011228272691369, "learning_rate": 3.470204869831744e-06, "loss": 0.0, "num_tokens": 9096784.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6340 }, { "completion_length": 17.225, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.95, "completions/mean_terminated_length": 16.95, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.4366061606160616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.434160264581442, "learning_rate": 3.464671921775572e-06, "loss": 0.0, "num_tokens": 9112334.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6350 }, { "completion_length": 20.125, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.4372937293729373, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1707235604524613, "learning_rate": 3.4591334163887226e-06, "loss": 0.0, "num_tokens": 9131608.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6360 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.05, "completions/mean_terminated_length": 17.05, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.43798129812981296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.305480483174324, "learning_rate": 3.4535893855776966e-06, "loss": 0.0, "num_tokens": 9148066.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6370 }, { "completion_length": 19.275, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.65, "completions/mean_terminated_length": 17.65, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.43866886688668866, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.256833466887474, "learning_rate": 3.4480398612808252e-06, "loss": 0.0, "num_tokens": 9162104.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6380 }, { "completion_length": 18.675, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 17.05, "completions/mean_terminated_length": 17.05, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.43935643564356436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2208698764443398, "learning_rate": 3.4424848754680894e-06, "loss": 0.0, "num_tokens": 9177914.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6390 }, { "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.025, "completions/mean_terminated_length": 17.025, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.44004400440044006, "frac_reward_zero_std": 1.0, "grad_norm": 2.4169297830667347e-05, "kl": 1.2537176951766014, "learning_rate": 3.4369244601409318e-06, "loss": 0.0, "num_tokens": 9189927.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6400 }, { "completion_length": 19.95, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.7, "completions/mean_terminated_length": 17.7, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.44073157315731576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0676696162670851, "learning_rate": 3.431358647332072e-06, "loss": 0.0, "num_tokens": 9204375.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6410 }, { "completion_length": 18.375, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.225, "completions/mean_terminated_length": 16.225, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.4414191419141914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1018186882138252, "learning_rate": 3.4257874691053273e-06, "loss": 0.0, "num_tokens": 9221860.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6420 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.4421067106710671, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2281114704906941, "learning_rate": 3.420210957555421e-06, "loss": 0.0, "num_tokens": 9236574.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6430 }, { "completion_length": 18.875, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4427942794279428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2437041610479356, "learning_rate": 3.4146291448078023e-06, "loss": 0.0, "num_tokens": 9249406.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6440 }, { "completion_length": 17.65, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.4434818481848185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.356383590400219, "learning_rate": 3.40904206301846e-06, "loss": 0.0, "num_tokens": 9263727.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6450 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.44416941694169415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.187030690908432, "learning_rate": 3.4034497443737367e-06, "loss": 0.0, "num_tokens": 9276570.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6460 }, { "completion_length": 18.225, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.075, "completions/mean_terminated_length": 17.075, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.44485698569856985, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3778787449002265, "learning_rate": 3.3978522210901437e-06, "loss": 0.0, "num_tokens": 9290833.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6470 }, { "completion_length": 18.225, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.6, "completions/mean_terminated_length": 16.6, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.44554455445544555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2157479397952557, "learning_rate": 3.3922495254141753e-06, "loss": 0.0, "num_tokens": 9304141.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6480 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.65, "completions/mean_terminated_length": 15.65, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.44623212321232125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.07399140894413, "learning_rate": 3.3866416896221243e-06, "loss": 0.0, "num_tokens": 9318759.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6490 }, { "completion_length": 19.55, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.4469196919691969, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2683727070689201, "learning_rate": 3.381028746019893e-06, "loss": 0.0, "num_tokens": 9332878.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6500 }, { "completion_length": 18.15, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.65, "completions/mean_terminated_length": 16.65, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.4476072607260726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2300020914524794, "learning_rate": 3.375410726942811e-06, "loss": 0.0, "num_tokens": 9348736.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6510 }, { "completion_length": 17.075, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 15.875, "completions/mean_terminated_length": 15.875, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.4482948294829483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3182392500340938, "learning_rate": 3.3697876647554454e-06, "loss": 0.0, "num_tokens": 9362927.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6520 }, { "completion_length": 18.675, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.448982398239824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.493177343904972, "learning_rate": 3.3641595918514167e-06, "loss": 0.0001, "num_tokens": 9377459.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6530 }, { "completion_length": 19.075, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 18.15, "completions/mean_terminated_length": 18.15, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.44966996699669964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.994390182197094, "learning_rate": 3.3585265406532113e-06, "loss": 0.0, "num_tokens": 9391769.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6540 }, { "completion_length": 19.925, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.45035753575357534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0702966153621674, "learning_rate": 3.3528885436119963e-06, "loss": 0.0, "num_tokens": 9410642.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6550 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.075, "completions/mean_terminated_length": 17.075, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.45104510451045104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2841762840747832, "learning_rate": 3.347245633207426e-06, "loss": 0.0, "num_tokens": 9424217.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6560 }, { "completion_length": 18.175, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.45173267326732675, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.261767642199993, "learning_rate": 3.3415978419474652e-06, "loss": 0.0, "num_tokens": 9439521.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6570 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 15.975, "completions/mean_terminated_length": 15.975, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.45242024202420245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3066105540841817, "learning_rate": 3.335945202368197e-06, "loss": 0.0, "num_tokens": 9454104.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6580 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 17.85, "completions/mean_terminated_length": 17.85, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.4531078107810781, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0359044279903173, "learning_rate": 3.3302877470336287e-06, "loss": 0.0, "num_tokens": 9468898.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6590 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.45, "completions/mean_terminated_length": 17.45, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.4537953795379538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2344742968678475, "learning_rate": 3.3246255085355168e-06, "loss": 0.0, "num_tokens": 9482364.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6600 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.4544829482948295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0627164557576179, "learning_rate": 3.3189585194931704e-06, "loss": 0.0, "num_tokens": 9496173.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6610 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.4551705170517052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0172219932079316, "learning_rate": 3.313286812553265e-06, "loss": 0.0, "num_tokens": 9510032.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6620 }, { "completion_length": 18.625, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.45585808580858084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0635989099740981, "learning_rate": 3.3076104203896576e-06, "loss": 0.0, "num_tokens": 9527143.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6630 }, { "completion_length": 18.15, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.525, "completions/mean_terminated_length": 17.525, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.45654565456545654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3625286146998405, "learning_rate": 3.301929375703193e-06, "loss": 0.0, "num_tokens": 9542252.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6640 }, { "completion_length": 18.025, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 17.125, "completions/mean_terminated_length": 17.125, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.45723322332233224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3592749863862992, "learning_rate": 3.29624371122152e-06, "loss": 0.0, "num_tokens": 9556897.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6650 }, { "completion_length": 19.025, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.45792079207920794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.285938036441803, "learning_rate": 3.290553459698903e-06, "loss": 0.0, "num_tokens": 9571737.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6660 }, { "completion_length": 18.225, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.15, "completions/mean_terminated_length": 17.15, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.4586083608360836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3218659296631814, "learning_rate": 3.284858653916029e-06, "loss": 0.0, "num_tokens": 9586187.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6670 }, { "completion_length": 17.975, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.15, "completions/mean_terminated_length": 16.15, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.4592959295929593, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4172565020620822, "learning_rate": 3.279159326679822e-06, "loss": 0.0, "num_tokens": 9599305.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6680 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.459983498349835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0378253124654293, "learning_rate": 3.2734555108232545e-06, "loss": 0.0, "num_tokens": 9612382.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6690 }, { "completion_length": 19.275, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.4606710671067107, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1182557418942451, "learning_rate": 3.2677472392051573e-06, "loss": 0.0, "num_tokens": 9626163.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6700 }, { "completion_length": 17.475, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.4613586358635864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1556994292885066, "learning_rate": 3.262034544710029e-06, "loss": 0.0, "num_tokens": 9640684.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6710 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.46204620462046203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.38791627548635, "learning_rate": 3.2563174602478476e-06, "loss": 0.0, "num_tokens": 9655129.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6720 }, { "completion_length": 23.075, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.4, "completions/mean_terminated_length": 20.4, "completions/min_length": 17.8, "completions/min_terminated_length": 17.8, "epoch": 0.46273377337733773, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2766552977263927, "learning_rate": 3.250596018753882e-06, "loss": 0.0001, "num_tokens": 9668813.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6730 }, { "completion_length": 19.45, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.46342134213421343, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0593340501189232, "learning_rate": 3.2448702531885026e-06, "loss": 0.0, "num_tokens": 9683592.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6740 }, { "completion_length": 19.175, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 16.375, "completions/mean_terminated_length": 16.375, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.46410891089108913, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2263304449617862, "learning_rate": 3.239140196536986e-06, "loss": 0.0, "num_tokens": 9699155.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6750 }, { "completion_length": 18.025, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 17.175, "completions/mean_terminated_length": 17.175, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.4647964796479648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9894723169505596, "learning_rate": 3.2334058818093335e-06, "loss": 0.0, "num_tokens": 9710874.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6760 }, { "completion_length": 16.625, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 15.675, "completions/mean_terminated_length": 15.675, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.4654840484048405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1933510966598988, "learning_rate": 3.227667342040074e-06, "loss": 0.0, "num_tokens": 9726493.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6770 }, { "completion_length": 18.15, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.4661716171617162, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.221380577981472, "learning_rate": 3.221924610288077e-06, "loss": 0.0, "num_tokens": 9742132.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6780 }, { "completion_length": 19.15, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.675, "completions/mean_terminated_length": 17.675, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.4668591859185919, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0848033614456654, "learning_rate": 3.216177719636362e-06, "loss": 0.0, "num_tokens": 9755131.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6790 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.025, "completions/mean_terminated_length": 17.025, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.4675467546754675, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2933375760912895, "learning_rate": 3.2104267031919057e-06, "loss": 0.0, "num_tokens": 9768864.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6800 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.4682343234323432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3907061845064164, "learning_rate": 3.2046715940854534e-06, "loss": 0.0, "num_tokens": 9783835.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6810 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.4689218921892189, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0086625922849635, "learning_rate": 3.1989124254713294e-06, "loss": 0.0, "num_tokens": 9795970.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6820 }, { "completion_length": 18.625, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.375, "completions/mean_terminated_length": 17.375, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.4696094609460946, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1297685861587525, "learning_rate": 3.193149230527242e-06, "loss": 0.0, "num_tokens": 9811737.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6830 }, { "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.775, "completions/mean_terminated_length": 17.775, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.47029702970297027, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.236744450032711, "learning_rate": 3.1873820424540957e-06, "loss": 0.0, "num_tokens": 9824248.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6840 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.35, "completions/mean_terminated_length": 16.35, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.47098459845984597, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2169081203639507, "learning_rate": 3.181610894475798e-06, "loss": 0.0, "num_tokens": 9840950.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6850 }, { "completion_length": 16.65, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 15.65, "completions/mean_terminated_length": 15.65, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.47167216721672167, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1514482542872428, "learning_rate": 3.1758358198390697e-06, "loss": 0.0, "num_tokens": 9854748.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6860 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 18.3, "completions/mean_terminated_length": 18.3, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.47235973597359737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2569075137376786, "learning_rate": 3.1700568518132522e-06, "loss": 0.0, "num_tokens": 9869012.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6870 }, { "completion_length": 18.775, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.4730473047304731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.232297607511282, "learning_rate": 3.164274023690116e-06, "loss": 0.0, "num_tokens": 9883672.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6880 }, { "completion_length": 19.05, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.4737348734873487, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1718416571617127, "learning_rate": 3.158487368783669e-06, "loss": 0.0, "num_tokens": 9898125.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6890 }, { "completion_length": 18.125, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.175, "completions/mean_terminated_length": 17.175, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.4744224422442244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9953506259247661, "learning_rate": 3.1526969204299655e-06, "loss": 0.0, "num_tokens": 9913588.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6900 }, { "completion_length": 20.325, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.4751100110011001, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0354203306138516, "learning_rate": 3.146902711986911e-06, "loss": 0.0, "num_tokens": 9926792.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6910 }, { "completion_length": 17.525, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.4757975797579758, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1262274965643884, "learning_rate": 3.141104776834076e-06, "loss": 0.0, "num_tokens": 9941716.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6920 }, { "completion_length": 21.075, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.175, "completions/mean_terminated_length": 18.175, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.47648514851485146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9324771128594875, "learning_rate": 3.135303148372496e-06, "loss": 0.0, "num_tokens": 9953975.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6930 }, { "completion_length": 21.075, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.425, "completions/mean_terminated_length": 18.425, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.47717271727172716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1364629313349723, "learning_rate": 3.1294978600244845e-06, "loss": 0.0, "num_tokens": 9967824.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6940 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.47786028602860287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2197353936731816, "learning_rate": 3.123688945233442e-06, "loss": 0.0, "num_tokens": 9981918.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6950 }, { "completion_length": 16.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.47854785478547857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.189404307305813, "learning_rate": 3.117876437463656e-06, "loss": 0.0, "num_tokens": 9994062.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6960 }, { "completion_length": 19.05, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.4792354235423542, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2797629989683628, "learning_rate": 3.1120603702001143e-06, "loss": 0.0, "num_tokens": 10008837.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6970 }, { "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.4799229922992299, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0676782980561257, "learning_rate": 3.106240776948313e-06, "loss": 0.0, "num_tokens": 10025003.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6980 }, { "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.4806105610561056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.479426984488964, "learning_rate": 3.1004176912340554e-06, "loss": 0.0001, "num_tokens": 10039955.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 6990 }, { "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.425, "completions/mean_terminated_length": 17.425, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.4812981298129813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1712445795536042, "learning_rate": 3.09459114660327e-06, "loss": 0.0, "num_tokens": 10054504.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7000 }, { "completion_length": 19.15, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.481985698569857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0692923165857793, "learning_rate": 3.0887611766218066e-06, "loss": 0.0, "num_tokens": 10068732.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7010 }, { "completion_length": 17.125, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 16.35, "completions/mean_terminated_length": 16.35, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.48267326732673266, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2424277186393737, "learning_rate": 3.0829278148752527e-06, "loss": 0.0, "num_tokens": 10081530.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7020 }, { "completion_length": 15.65, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.48336083608360836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9960786886513233, "learning_rate": 3.0770910949687323e-06, "loss": 0.0, "num_tokens": 10095148.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7030 }, { "completion_length": 17.075, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 16.2, "completions/mean_terminated_length": 16.2, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.48404840484048406, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2292386896908283, "learning_rate": 3.0712510505267145e-06, "loss": 0.0, "num_tokens": 10108600.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7040 }, { "completion_length": 20.675, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.48473597359735976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1080425202846527, "learning_rate": 3.0654077151928246e-06, "loss": 0.0, "num_tokens": 10122557.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7050 }, { "completion_length": 19.725, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.4854235423542354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0285929311066866, "learning_rate": 3.0595611226296416e-06, "loss": 0.0, "num_tokens": 10137576.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7060 }, { "completion_length": 18.825, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.4861111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.440330797433853, "learning_rate": 3.053711306518511e-06, "loss": 0.0, "num_tokens": 10152791.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7070 }, { "completion_length": 17.65, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.525, "completions/mean_terminated_length": 16.525, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.4867986798679868, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4143182210624219, "learning_rate": 3.0478583005593505e-06, "loss": 0.0001, "num_tokens": 10166484.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7080 }, { "completion_length": 18.775, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.05, "completions/mean_terminated_length": 17.05, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.4874862486248625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2161018647253514, "learning_rate": 3.0420021384704495e-06, "loss": 0.0, "num_tokens": 10180494.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7090 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.48817381738173815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0749999246094375, "learning_rate": 3.0361428539882833e-06, "loss": 0.0, "num_tokens": 10192911.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7100 }, { "completion_length": 17.975, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.48886138613861385, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.36263635084033, "learning_rate": 3.0302804808673147e-06, "loss": 0.0, "num_tokens": 10206210.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7110 }, { "completion_length": 19.775, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.65, "completions/mean_terminated_length": 17.65, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.48954895489548955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2288992650806905, "learning_rate": 3.024415052879796e-06, "loss": 0.0, "num_tokens": 10220008.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7120 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.49023652365236525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1534809999167919, "learning_rate": 3.018546603815582e-06, "loss": 0.0, "num_tokens": 10233224.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7130 }, { "completion_length": 17.875, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.4909240924092409, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2831963915377855, "learning_rate": 3.0126751674819315e-06, "loss": 0.0, "num_tokens": 10248732.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7140 }, { "completion_length": 20.225, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.35, "completions/mean_terminated_length": 17.35, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.4916116611661166, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9669013649225235, "learning_rate": 3.006800777703309e-06, "loss": 0.0, "num_tokens": 10262246.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7150 }, { "completion_length": 19.575, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 18.025, "completions/mean_terminated_length": 18.025, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.4922992299229923, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0742177367210388, "learning_rate": 3.000923468321197e-06, "loss": 0.0, "num_tokens": 10278631.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7160 }, { "completion_length": 19.45, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.492986798679868, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0535475328564643, "learning_rate": 2.995043273193895e-06, "loss": 0.0, "num_tokens": 10293887.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7170 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.4936743674367437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9863342270255089, "learning_rate": 2.9891602261963275e-06, "loss": 0.0, "num_tokens": 10307234.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7180 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.49436193619361934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.35015000551939, "learning_rate": 2.9832743612198495e-06, "loss": 0.0001, "num_tokens": 10320246.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7190 }, { "completion_length": 19.475, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 17.075, "completions/mean_terminated_length": 17.075, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.49504950495049505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5178268268704413, "learning_rate": 2.977385712172049e-06, "loss": 0.0001, "num_tokens": 10333833.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7200 }, { "completion_length": 19.775, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.85, "completions/mean_terminated_length": 17.85, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.49573707370737075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9287362463772297, "learning_rate": 2.971494312976551e-06, "loss": 0.0, "num_tokens": 10346595.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7210 }, { "completion_length": 20.95, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 18.3, "completions/mean_terminated_length": 18.3, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.49642464246424645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0673484843224288, "learning_rate": 2.965600197572827e-06, "loss": 0.0, "num_tokens": 10361299.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7220 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.4971122112211221, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1594716899096966, "learning_rate": 2.9597033999159937e-06, "loss": 0.0, "num_tokens": 10377179.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7230 }, { "completion_length": 20.05, "completions/clipped_ratio": 0.0, "completions/max_length": 20.8, "completions/max_terminated_length": 20.8, "completions/mean_length": 19.575, "completions/mean_terminated_length": 19.575, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.4977997799779978, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.058990754187107, "learning_rate": 2.95380395397662e-06, "loss": 0.0, "num_tokens": 10392714.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7240 }, { "completion_length": 21.225, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 17.825, "completions/mean_terminated_length": 17.825, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.4984873487348735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.211721307784319, "learning_rate": 2.9479018937405323e-06, "loss": 0.0, "num_tokens": 10407355.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7250 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.875, "completions/mean_terminated_length": 17.875, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.4991749174917492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4011707127094268, "learning_rate": 2.941997253208617e-06, "loss": 0.0, "num_tokens": 10421050.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7260 }, { "completion_length": 19.85, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.49986248624862484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0872914239764213, "learning_rate": 2.936090066396625e-06, "loss": 0.0, "num_tokens": 10435561.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7270 }, { "completion_length": 19.675, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.8, "completions/mean_terminated_length": 17.8, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.5005500550055005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3806903678923845, "learning_rate": 2.930180367334976e-06, "loss": 0.0001, "num_tokens": 10450973.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7280 }, { "completion_length": 18.875, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.5012376237623762, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3714930087327957, "learning_rate": 2.924268190068563e-06, "loss": 0.0, "num_tokens": 10464744.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7290 }, { "completion_length": 17.675, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.5019251925192519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2005647614598274, "learning_rate": 2.9183535686565556e-06, "loss": 0.0, "num_tokens": 10481285.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7300 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 15.85, "completions/mean_terminated_length": 15.85, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.5026127612761276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0717236608266831, "learning_rate": 2.9124365371722007e-06, "loss": 0.0, "num_tokens": 10496979.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7310 }, { "completion_length": 19.825, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.85, "completions/mean_terminated_length": 16.85, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.5033003300330033, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.223651571944356, "learning_rate": 2.9065171297026352e-06, "loss": 0.0, "num_tokens": 10512065.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7320 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.503987898789879, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1987101197242738, "learning_rate": 2.900595380348678e-06, "loss": 0.0, "num_tokens": 10528943.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7330 }, { "completion_length": 19.325, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5046754675467546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1005229383707047, "learning_rate": 2.894671323224642e-06, "loss": 0.0, "num_tokens": 10543040.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7340 }, { "completion_length": 19.625, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.825, "completions/mean_terminated_length": 17.825, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.5053630363036303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.284407775849104, "learning_rate": 2.8887449924581353e-06, "loss": 0.0, "num_tokens": 10558105.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7350 }, { "completion_length": 17.65, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 15.65, "completions/mean_terminated_length": 15.65, "completions/min_length": 13.9, "completions/min_terminated_length": 13.9, "epoch": 0.506050605060506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2672508627176284, "learning_rate": 2.882816422189862e-06, "loss": 0.0, "num_tokens": 10572983.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7360 }, { "completion_length": 18.275, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5067381738173817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.140646267682314, "learning_rate": 2.876885646573429e-06, "loss": 0.0, "num_tokens": 10583724.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7370 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 16.025, "completions/mean_terminated_length": 16.025, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.5074257425742574, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1728091657161712, "learning_rate": 2.870952699775148e-06, "loss": 0.0, "num_tokens": 10599161.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7380 }, { "completion_length": 17.45, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.5081133113311331, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9306168712675571, "learning_rate": 2.865017615973839e-06, "loss": 0.0, "num_tokens": 10610849.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7390 }, { "completion_length": 20.275, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.5088008800880088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3517898738384246, "learning_rate": 2.859080429360631e-06, "loss": 0.0001, "num_tokens": 10628073.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7400 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.925, "completions/mean_terminated_length": 17.925, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.5094884488448845, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1693673767149448, "learning_rate": 2.853141174138768e-06, "loss": 0.0, "num_tokens": 10643486.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7410 }, { "completion_length": 16.45, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.5101760176017601, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9463715203106403, "learning_rate": 2.8471998845234126e-06, "loss": 0.0, "num_tokens": 10657128.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7420 }, { "completion_length": 19.625, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.5108635863586358, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.028014036267996, "learning_rate": 2.8412565947414457e-06, "loss": 0.0, "num_tokens": 10671467.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7430 }, { "completion_length": 18.15, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.65, "completions/mean_terminated_length": 17.65, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5115511551155115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2899583131074905, "learning_rate": 2.8353113390312686e-06, "loss": 0.0, "num_tokens": 10684833.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7440 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.65, "completions/mean_terminated_length": 17.65, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.5122387238723872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2208408184349537, "learning_rate": 2.8293641516426135e-06, "loss": 0.0, "num_tokens": 10700923.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7450 }, { "completion_length": 17.35, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.5129262926292629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4087644547224045, "learning_rate": 2.8234150668363347e-06, "loss": 0.0, "num_tokens": 10715160.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7460 }, { "completion_length": 20.825, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 18.825, "completions/mean_terminated_length": 18.825, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.5136138613861386, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1469629868865012, "learning_rate": 2.8174641188842217e-06, "loss": 0.0, "num_tokens": 10729565.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7470 }, { "completion_length": 18.725, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.525, "completions/mean_terminated_length": 16.525, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.5143014301430143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.260312558710575, "learning_rate": 2.811511342068796e-06, "loss": 0.0, "num_tokens": 10743850.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7480 }, { "completion_length": 17.675, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.95, "completions/mean_terminated_length": 16.95, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.51498899889989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1628944411873818, "learning_rate": 2.805556770683114e-06, "loss": 0.0, "num_tokens": 10759216.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7490 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5156765676567657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1406783670186997, "learning_rate": 2.7996004390305697e-06, "loss": 0.0, "num_tokens": 10773837.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7500 }, { "completion_length": 17.85, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.5163641364136413, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3061078637838364, "learning_rate": 2.793642381424702e-06, "loss": 0.0, "num_tokens": 10790833.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7510 }, { "completion_length": 17.975, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.525, "completions/mean_terminated_length": 16.525, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.517051705170517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3148225136101246, "learning_rate": 2.787682632188987e-06, "loss": 0.0, "num_tokens": 10804014.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7520 }, { "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.825, "completions/mean_terminated_length": 17.825, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.5177392739273927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0353169120848178, "learning_rate": 2.781721225656651e-06, "loss": 0.0, "num_tokens": 10818775.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7530 }, { "completion_length": 19.775, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.45, "completions/mean_terminated_length": 17.45, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.5184268426842684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0294340495020151, "learning_rate": 2.775758196170464e-06, "loss": 0.0, "num_tokens": 10834281.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7540 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.2, "completions/mean_terminated_length": 18.2, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.5191144114411441, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.150313251465559, "learning_rate": 2.769793578082547e-06, "loss": 0.0, "num_tokens": 10850957.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7550 }, { "completion_length": 20.65, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.5198019801980198, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8891532845795155, "learning_rate": 2.7638274057541735e-06, "loss": 0.0, "num_tokens": 10866561.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7560 }, { "completion_length": 19.525, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.9, "completions/mean_terminated_length": 17.9, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.5204895489548955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.353417307138443, "learning_rate": 2.75785971355557e-06, "loss": 0.0, "num_tokens": 10882493.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7570 }, { "completion_length": 19.85, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 17.775, "completions/mean_terminated_length": 17.775, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.5211771177117712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3048854753375054, "learning_rate": 2.7518905358657184e-06, "loss": 0.0, "num_tokens": 10897972.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7580 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 18.55, "completions/mean_terminated_length": 18.55, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.5218646864686468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2213730655610562, "learning_rate": 2.7459199070721593e-06, "loss": 0.0, "num_tokens": 10911370.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7590 }, { "completion_length": 18.725, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.5225522552255225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4572544232010842, "learning_rate": 2.7399478615707923e-06, "loss": 0.0, "num_tokens": 10927892.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7600 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.5232398239823982, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5330557361245156, "learning_rate": 2.7339744337656783e-06, "loss": 0.0001, "num_tokens": 10940449.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7610 }, { "completion_length": 21.425, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5239273927392739, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8406905103474855, "learning_rate": 2.72799965806884e-06, "loss": 0.0, "num_tokens": 10954480.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7620 }, { "completion_length": 21.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.5246149614961496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.22895505130291, "learning_rate": 2.7220235689000694e-06, "loss": 0.0, "num_tokens": 10969768.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7630 }, { "completion_length": 20.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.5253025302530253, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0801552880555392, "learning_rate": 2.716046200686721e-06, "loss": 0.0, "num_tokens": 10983596.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7640 }, { "completion_length": 16.95, "completions/clipped_ratio": 0.0, "completions/max_length": 16.2, "completions/max_terminated_length": 16.2, "completions/mean_length": 15.775, "completions/mean_terminated_length": 15.775, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.525990099009901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0529656421393156, "learning_rate": 2.710067587863519e-06, "loss": 0.0, "num_tokens": 10998903.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7650 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.15, "completions/mean_terminated_length": 17.15, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.5266776677667767, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9225304060149938, "learning_rate": 2.7040877648723585e-06, "loss": 0.0, "num_tokens": 11011201.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7660 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.5273652365236524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3096301540732385, "learning_rate": 2.6981067661621054e-06, "loss": 0.0, "num_tokens": 11023939.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7670 }, { "completion_length": 20.475, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.05, "completions/mean_terminated_length": 18.05, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.528052805280528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0370046388357879, "learning_rate": 2.6921246261883977e-06, "loss": 0.0, "num_tokens": 11037705.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7680 }, { "completion_length": 19.525, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.5287403740374037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1600284928455948, "learning_rate": 2.686141379413451e-06, "loss": 0.0, "num_tokens": 11052973.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7690 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.5294279427942794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2098301276564598, "learning_rate": 2.680157060305854e-06, "loss": 0.0, "num_tokens": 11066413.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7700 }, { "completion_length": 16.35, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 16.325, "completions/mean_terminated_length": 16.325, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.5301155115511551, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.007573515176773, "learning_rate": 2.674171703340374e-06, "loss": 0.0, "num_tokens": 11082862.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7710 }, { "completion_length": 19.425, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5308030803080308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3090703263878822, "learning_rate": 2.6681853429977583e-06, "loss": 0.0, "num_tokens": 11099780.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7720 }, { "completion_length": 18.425, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.425, "completions/mean_terminated_length": 17.425, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.5314906490649065, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4191451646387576, "learning_rate": 2.662198013764533e-06, "loss": 0.0, "num_tokens": 11115345.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7730 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.6, "completions/mean_terminated_length": 16.6, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5321782178217822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2996338605880737, "learning_rate": 2.6562097501328072e-06, "loss": 0.0, "num_tokens": 11127985.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7740 }, { "completion_length": 17.225, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5328657865786579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2814787819981575, "learning_rate": 2.6502205866000703e-06, "loss": 0.0, "num_tokens": 11141819.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7750 }, { "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.725, "completions/mean_terminated_length": 16.725, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.5335533553355336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.133877544477582, "learning_rate": 2.6442305576689996e-06, "loss": 0.0, "num_tokens": 11159004.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7760 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.5342409240924092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2555125415325166, "learning_rate": 2.638239697847255e-06, "loss": 0.0, "num_tokens": 11172859.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7770 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 15.7, "completions/mean_terminated_length": 15.7, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.5349284928492849, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1996543176472187, "learning_rate": 2.6322480416472844e-06, "loss": 0.0, "num_tokens": 11185251.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7780 }, { "completion_length": 16.85, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.725, "completions/mean_terminated_length": 15.725, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.5356160616061606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1012627463787794, "learning_rate": 2.6262556235861237e-06, "loss": 0.0, "num_tokens": 11199704.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7790 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.025, "completions/mean_terminated_length": 17.025, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5363036303630363, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3546480126678944, "learning_rate": 2.6202624781851963e-06, "loss": 0.0, "num_tokens": 11214753.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7800 }, { "completion_length": 16.3, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 15.9, "completions/mean_terminated_length": 15.9, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.536991199119912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0696083324030041, "learning_rate": 2.614268639970116e-06, "loss": 0.0, "num_tokens": 11228825.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7810 }, { "completion_length": 17.825, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.35, "completions/mean_terminated_length": 16.35, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.5376787678767877, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.09154290035367, "learning_rate": 2.6082741434704912e-06, "loss": 0.0, "num_tokens": 11242387.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7820 }, { "completion_length": 18.875, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5383663366336634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0892216626554727, "learning_rate": 2.6022790232197164e-06, "loss": 0.0, "num_tokens": 11255895.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7830 }, { "completion_length": 17.775, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.025, "completions/mean_terminated_length": 16.025, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.5390539053905391, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1405236177146434, "learning_rate": 2.5962833137547842e-06, "loss": 0.0, "num_tokens": 11269772.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7840 }, { "completion_length": 17.075, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5397414741474147, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2529190577566625, "learning_rate": 2.5902870496160813e-06, "loss": 0.0, "num_tokens": 11282427.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7850 }, { "completion_length": 19.425, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5404290429042904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.181235571205616, "learning_rate": 2.5842902653471867e-06, "loss": 0.0, "num_tokens": 11295581.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7860 }, { "completion_length": 16.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.525, "completions/mean_terminated_length": 15.525, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.5411166116611661, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1392400201410056, "learning_rate": 2.5782929954946787e-06, "loss": 0.0, "num_tokens": 11308422.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7870 }, { "completion_length": 21.125, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5418041804180418, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9744973611086607, "learning_rate": 2.5722952746079333e-06, "loss": 0.0, "num_tokens": 11321961.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7880 }, { "completion_length": 17.925, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.5424917491749175, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8853171467781067, "learning_rate": 2.5662971372389213e-06, "loss": 0.0, "num_tokens": 11338704.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7890 }, { "completion_length": 19.575, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.8, "completions/mean_terminated_length": 17.8, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.5431793179317932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0291051536798477, "learning_rate": 2.560298617942017e-06, "loss": 0.0, "num_tokens": 11351496.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7900 }, { "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 17.4, "completions/mean_terminated_length": 17.4, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.5438668866886689, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.09465791285038, "learning_rate": 2.5542997512737914e-06, "loss": 0.0, "num_tokens": 11367568.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7910 }, { "completion_length": 19.45, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.5445544554455446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2983490988612174, "learning_rate": 2.5483005717928193e-06, "loss": 0.0, "num_tokens": 11382484.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7920 }, { "completion_length": 21.65, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 18.875, "completions/mean_terminated_length": 18.875, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.5452420242024203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0815192684531212, "learning_rate": 2.542301114059476e-06, "loss": 0.0, "num_tokens": 11397067.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7930 }, { "completion_length": 18.825, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 17.125, "completions/mean_terminated_length": 17.125, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.5459295929592959, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.186297105997801, "learning_rate": 2.5363014126357416e-06, "loss": 0.0, "num_tokens": 11412512.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7940 }, { "completion_length": 19.525, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.95, "completions/mean_terminated_length": 17.95, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.5466171617161716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2355756394565105, "learning_rate": 2.5303015020849963e-06, "loss": 0.0, "num_tokens": 11429502.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7950 }, { "completion_length": 20.225, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.5473047304730473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.404906328767538, "learning_rate": 2.52430141697183e-06, "loss": 0.0001, "num_tokens": 11444831.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7960 }, { "completion_length": 17.975, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.45, "completions/mean_terminated_length": 17.45, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.547992299229923, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2544807229191064, "learning_rate": 2.5183011918618343e-06, "loss": 0.0, "num_tokens": 11461913.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7970 }, { "completion_length": 17.575, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5486798679867987, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1397920138202609, "learning_rate": 2.5123008613214093e-06, "loss": 0.0, "num_tokens": 11475142.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7980 }, { "completion_length": 17.125, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.5493674367436744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.439752745628357, "learning_rate": 2.5063004599175617e-06, "loss": 0.0, "num_tokens": 11490114.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 7990 }, { "completion_length": 17.725, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5500550055005501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.17640033736825, "learning_rate": 2.5003000222177073e-06, "loss": 0.0, "num_tokens": 11503235.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8000 }, { "completion_length": 19.85, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 18.85, "completions/mean_terminated_length": 18.85, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.5507425742574258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1319660669192673, "learning_rate": 2.4942995827894694e-06, "loss": 0.0, "num_tokens": 11517157.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8010 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5514301430143014, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.386430050432682, "learning_rate": 2.4882991762004825e-06, "loss": 0.0, "num_tokens": 11533377.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8020 }, { "completion_length": 17.075, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.025, "completions/mean_terminated_length": 16.025, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.5521177117711771, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0991659818217159, "learning_rate": 2.482298837018191e-06, "loss": 0.0, "num_tokens": 11546402.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8030 }, { "completion_length": 18.775, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 17.4, "completions/mean_terminated_length": 17.4, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.5528052805280528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2415507107973098, "learning_rate": 2.476298599809653e-06, "loss": 0.0, "num_tokens": 11559974.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8040 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.5534928492849285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3896477714180946, "learning_rate": 2.4702984991413372e-06, "loss": 0.0001, "num_tokens": 11574207.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8050 }, { "completion_length": 18.375, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5541804180418042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3229296140372753, "learning_rate": 2.464298569578925e-06, "loss": 0.0, "num_tokens": 11589047.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8060 }, { "completion_length": 18.35, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.5548679867986799, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4035261940211057, "learning_rate": 2.458298845687116e-06, "loss": 0.0, "num_tokens": 11602702.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8070 }, { "completion_length": 19.575, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 16.35, "completions/mean_terminated_length": 16.35, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5555555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0341587118804454, "learning_rate": 2.4522993620294202e-06, "loss": 0.0, "num_tokens": 11618512.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8080 }, { "completion_length": 17.575, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5562431243124313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2614828519523145, "learning_rate": 2.446300153167965e-06, "loss": 0.0, "num_tokens": 11634235.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8090 }, { "completion_length": 19.075, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.525, "completions/mean_terminated_length": 17.525, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.556930693069307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.953551921620965, "learning_rate": 2.4403012536632987e-06, "loss": 0.0, "num_tokens": 11647180.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8100 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 15.95, "completions/mean_terminated_length": 15.95, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5576182618261826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2337733075022697, "learning_rate": 2.4343026980741814e-06, "loss": 0.0, "num_tokens": 11661174.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8110 }, { "completion_length": 20.325, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.925, "completions/mean_terminated_length": 18.925, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.5583058305830583, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2463152900338172, "learning_rate": 2.4283045209573954e-06, "loss": 0.0, "num_tokens": 11675591.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8120 }, { "completion_length": 17.825, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.65, "completions/mean_terminated_length": 15.65, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.558993399339934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1159010961651803, "learning_rate": 2.422306756867543e-06, "loss": 0.0, "num_tokens": 11688109.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8130 }, { "completion_length": 20.475, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 18.775, "completions/mean_terminated_length": 18.775, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.5596809680968097, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9875838525593281, "learning_rate": 2.4163094403568457e-06, "loss": 0.0, "num_tokens": 11701616.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8140 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 16.825, "completions/mean_terminated_length": 16.825, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5603685368536854, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2746304847300052, "learning_rate": 2.4103126059749454e-06, "loss": 0.0, "num_tokens": 11715369.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8150 }, { "completion_length": 21.225, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.275, "completions/mean_terminated_length": 19.275, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.5610561056105611, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2565015479922295, "learning_rate": 2.4043162882687117e-06, "loss": 0.0001, "num_tokens": 11728984.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8160 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 16.825, "completions/mean_terminated_length": 16.825, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.5617436743674368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.031527753919363, "learning_rate": 2.398320521782031e-06, "loss": 0.0, "num_tokens": 11744173.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8170 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5624312431243125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2719090670347213, "learning_rate": 2.392325341055619e-06, "loss": 0.0, "num_tokens": 11758569.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8180 }, { "completion_length": 20.55, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.025, "completions/mean_terminated_length": 19.025, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.5631188118811881, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1528592348098754, "learning_rate": 2.386330780626814e-06, "loss": 0.0, "num_tokens": 11772394.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8190 }, { "completion_length": 18.55, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.5638063806380638, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.281024768948555, "learning_rate": 2.3803368750293835e-06, "loss": 0.0, "num_tokens": 11787575.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8200 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.5644939493949395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.341177748888731, "learning_rate": 2.374343658793321e-06, "loss": 0.0, "num_tokens": 11801456.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8210 }, { "completion_length": 19.675, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 18.45, "completions/mean_terminated_length": 18.45, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.5651815181518152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1610834196209907, "learning_rate": 2.368351166444648e-06, "loss": 0.0, "num_tokens": 11816618.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8220 }, { "completion_length": 22.125, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 18.15, "completions/mean_terminated_length": 18.15, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.5658690869086909, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.237844767421484, "learning_rate": 2.362359432505218e-06, "loss": 0.0001, "num_tokens": 11830048.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8230 }, { "completion_length": 17.175, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5665566556655666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4094478785991669, "learning_rate": 2.356368491492514e-06, "loss": 0.0, "num_tokens": 11843600.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8240 }, { "completion_length": 19.325, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.5672442244224423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.194662444293499, "learning_rate": 2.3503783779194502e-06, "loss": 0.0, "num_tokens": 11858700.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8250 }, { "completion_length": 17.225, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.15, "completions/mean_terminated_length": 16.15, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.567931793179318, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1682618951890618, "learning_rate": 2.3443891262941763e-06, "loss": 0.0, "num_tokens": 11871618.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8260 }, { "completion_length": 20.225, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 18.4, "completions/mean_terminated_length": 18.4, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.5686193619361937, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1109918273985386, "learning_rate": 2.338400771119875e-06, "loss": 0.0, "num_tokens": 11884450.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8270 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 15.6, "completions/mean_terminated_length": 15.6, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.5693069306930693, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2410488959401846, "learning_rate": 2.3324133468945636e-06, "loss": 0.0, "num_tokens": 11900106.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8280 }, { "completion_length": 17.925, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.65, "completions/mean_terminated_length": 16.65, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.569994499449945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1313690185546874, "learning_rate": 2.3264268881109e-06, "loss": 0.0, "num_tokens": 11916548.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8290 }, { "completion_length": 18.125, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.5706820682068207, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8695598168298602, "learning_rate": 2.3204414292559764e-06, "loss": 0.0, "num_tokens": 11929015.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8300 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 18.575, "completions/mean_terminated_length": 18.575, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.5713696369636964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0880703181028366, "learning_rate": 2.3144570048111263e-06, "loss": 0.0, "num_tokens": 11943050.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8310 }, { "completion_length": 17.725, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.5720572057205721, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.003603034466505, "learning_rate": 2.308473649251724e-06, "loss": 0.0, "num_tokens": 11957456.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8320 }, { "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.5727447744774478, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8983209150843322, "learning_rate": 2.3024913970469866e-06, "loss": 0.0, "num_tokens": 11973024.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8330 }, { "completion_length": 19.925, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.525, "completions/mean_terminated_length": 17.525, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5734323432343235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.034299236536026, "learning_rate": 2.296510282659774e-06, "loss": 0.0, "num_tokens": 11988141.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8340 }, { "completion_length": 17.225, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.5741199119911992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2049643009901048, "learning_rate": 2.2905303405463916e-06, "loss": 0.0, "num_tokens": 12006072.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8350 }, { "completion_length": 19.025, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 15.625, "completions/mean_terminated_length": 15.625, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.5748074807480749, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.046896144747734, "learning_rate": 2.284551605156392e-06, "loss": 0.0, "num_tokens": 12022777.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8360 }, { "completion_length": 19.475, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.5754950495049505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2499522306025028, "learning_rate": 2.278574110932376e-06, "loss": 0.0, "num_tokens": 12038086.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8370 }, { "completion_length": 16.925, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.45, "completions/mean_terminated_length": 15.45, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.5761826182618262, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2376941472291947, "learning_rate": 2.272597892309793e-06, "loss": 0.0, "num_tokens": 12052140.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8380 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.5768701870187019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0631072670221329, "learning_rate": 2.2666229837167454e-06, "loss": 0.0, "num_tokens": 12063102.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8390 }, { "completion_length": 19.45, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.9, "completions/mean_terminated_length": 17.9, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.5775577557755776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1769488338381051, "learning_rate": 2.2606494195737884e-06, "loss": 0.0, "num_tokens": 12076170.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8400 }, { "completion_length": 17.613333333333333, "completions/clipped_ratio": 0.0, "completions/max_length": 17.72222222222222, "completions/max_terminated_length": 17.72222222222222, "completions/mean_length": 16.930555555555557, "completions/mean_terminated_length": 16.930555555555557, "completions/min_length": 16.166666666666668, "completions/min_terminated_length": 16.166666666666668, "epoch": 0.5782453245324533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0846174057324727, "learning_rate": 2.2546772342937303e-06, "loss": 0.0, "num_tokens": 12091697.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8410 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 17.35, "completions/mean_terminated_length": 17.35, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.578932893289329, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4397614181041718, "learning_rate": 2.2487064622814387e-06, "loss": 0.0, "num_tokens": 12106631.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8420 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 18.7, "completions/mean_terminated_length": 18.7, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.5796204620462047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2253276860341429, "learning_rate": 2.242737137933638e-06, "loss": 0.0001, "num_tokens": 12123891.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8430 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.925, "completions/mean_terminated_length": 17.925, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.5803080308030804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0086985297501088, "learning_rate": 2.236769295638712e-06, "loss": 0.0, "num_tokens": 12138344.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8440 }, { "completion_length": 15.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 14.725, "completions/mean_terminated_length": 14.725, "completions/min_length": 13.9, "completions/min_terminated_length": 13.9, "epoch": 0.5809955995599559, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2906472340226174, "learning_rate": 2.23080296977651e-06, "loss": 0.0, "num_tokens": 12151277.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8450 }, { "completion_length": 16.6, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 15.825, "completions/mean_terminated_length": 15.825, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.5816831683168316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2870510045439005, "learning_rate": 2.224838194718141e-06, "loss": 0.0, "num_tokens": 12164082.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8460 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 17.1, "completions/mean_terminated_length": 17.1, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.5823707370737073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3544713765382768, "learning_rate": 2.218875004825783e-06, "loss": 0.0, "num_tokens": 12179754.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8470 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.583058305830583, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5019911035895348, "learning_rate": 2.21291343445248e-06, "loss": 0.0001, "num_tokens": 12193414.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8480 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.45, "completions/mean_terminated_length": 16.45, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.5837458745874587, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1590588353574276, "learning_rate": 2.2069535179419494e-06, "loss": 0.0, "num_tokens": 12206572.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8490 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.525, "completions/mean_terminated_length": 16.525, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.5844334433443344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1892656478099526, "learning_rate": 2.2009952896283777e-06, "loss": 0.0, "num_tokens": 12220553.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8500 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 18.4, "completions/mean_terminated_length": 18.4, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.5851210121012101, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.296466251462698, "learning_rate": 2.195038783836227e-06, "loss": 0.0001, "num_tokens": 12235325.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8510 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.5858085808580858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1588373471051454, "learning_rate": 2.189084034880037e-06, "loss": 0.0, "num_tokens": 12250901.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8520 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.5864961496149615, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.122246940433979, "learning_rate": 2.1831310770642257e-06, "loss": 0.0, "num_tokens": 12265029.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8530 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.8, "completions/mean_terminated_length": 17.8, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.5871837183718371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4273377619683743, "learning_rate": 2.1771799446828925e-06, "loss": 0.0001, "num_tokens": 12279165.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8540 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.5878712871287128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9175143100321292, "learning_rate": 2.171230672019622e-06, "loss": 0.0, "num_tokens": 12293098.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8550 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.5885588558855885, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2116529121994972, "learning_rate": 2.165283293347283e-06, "loss": 0.0, "num_tokens": 12310206.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8560 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.5892464246424642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9623834818601609, "learning_rate": 2.1593378429278357e-06, "loss": 0.0, "num_tokens": 12327350.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8570 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 18.05, "completions/mean_terminated_length": 18.05, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.5899339933993399, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2423087801784276, "learning_rate": 2.1533943550121307e-06, "loss": 0.0, "num_tokens": 12341784.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8580 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.825, "completions/mean_terminated_length": 16.825, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.5906215621562156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.197791612148285, "learning_rate": 2.147452863839713e-06, "loss": 0.0, "num_tokens": 12357701.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8590 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.5913091309130913, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.425983040034771, "learning_rate": 2.1415134036386236e-06, "loss": 0.0, "num_tokens": 12371296.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8600 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.591996699669967, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8437675893306732, "learning_rate": 2.135576008625206e-06, "loss": 0.0, "num_tokens": 12384111.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8610 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.9, "completions/mean_terminated_length": 17.9, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.5926842684268426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.19951608851552, "learning_rate": 2.129640713003905e-06, "loss": 0.0, "num_tokens": 12396699.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8620 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.5933718371837183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4041204899549484, "learning_rate": 2.1237075509670704e-06, "loss": 0.0, "num_tokens": 12409535.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8630 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.594059405940594, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2162705175578594, "learning_rate": 2.11777655669476e-06, "loss": 0.0, "num_tokens": 12423247.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8640 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.425, "completions/mean_terminated_length": 15.425, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.5947469746974697, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2057185828685761, "learning_rate": 2.1118477643545475e-06, "loss": 0.0, "num_tokens": 12438008.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8650 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.5954345434543454, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1366472691297531, "learning_rate": 2.105921208101318e-06, "loss": 0.0, "num_tokens": 12453968.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8660 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.5961221122112211, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.047244779765606, "learning_rate": 2.0999969220770743e-06, "loss": 0.0, "num_tokens": 12467399.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8670 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 18.725, "completions/mean_terminated_length": 18.725, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.5968096809680968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2203440919518471, "learning_rate": 2.0940749404107446e-06, "loss": 0.0, "num_tokens": 12484184.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8680 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.5974972497249725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2722304835915565, "learning_rate": 2.088155297217979e-06, "loss": 0.0001, "num_tokens": 12499899.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8690 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 18.275, "completions/mean_terminated_length": 18.275, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.5981848184818482, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1011649891734123, "learning_rate": 2.0822380266009544e-06, "loss": 0.0, "num_tokens": 12514474.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8700 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 18.1, "completions/mean_terminated_length": 18.1, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.5988723872387238, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.114920786768198, "learning_rate": 2.076323162648185e-06, "loss": 0.0, "num_tokens": 12528530.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8710 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.5995599559955995, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2510825961828231, "learning_rate": 2.0704107394343168e-06, "loss": 0.0, "num_tokens": 12542732.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8720 }, { "completion_length": 16.3, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 15.375, "completions/mean_terminated_length": 15.375, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.6002475247524752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.08427300080657, "learning_rate": 2.064500791019933e-06, "loss": 0.0, "num_tokens": 12557119.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8730 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.6009350935093509, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0661740265786648, "learning_rate": 2.0585933514513667e-06, "loss": 0.0, "num_tokens": 12570742.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8740 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.6016226622662266, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1203237427398562, "learning_rate": 2.052688454760492e-06, "loss": 0.0, "num_tokens": 12588153.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8750 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.6023102310231023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3821448743343354, "learning_rate": 2.046786134964535e-06, "loss": 0.0, "num_tokens": 12602619.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8760 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.45, "completions/mean_terminated_length": 15.45, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.602997799779978, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9175124347675592, "learning_rate": 2.0408864260658775e-06, "loss": 0.0, "num_tokens": 12614245.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8770 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.6036853685368537, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0806362126022577, "learning_rate": 2.0349893620518614e-06, "loss": 0.0, "num_tokens": 12628458.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8780 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 16.85, "completions/mean_terminated_length": 16.85, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.6043729372937293, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.087851022928953, "learning_rate": 2.02909497689459e-06, "loss": 0.0, "num_tokens": 12641660.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8790 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.605060506050605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2375859890133143, "learning_rate": 2.023203304550733e-06, "loss": 0.0, "num_tokens": 12655911.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8800 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.025, "completions/mean_terminated_length": 17.025, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.6057480748074807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2029473409056664, "learning_rate": 2.0173143789613367e-06, "loss": 0.0, "num_tokens": 12670172.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8810 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.6064356435643564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3563312634825706, "learning_rate": 2.011428234051619e-06, "loss": 0.0, "num_tokens": 12686014.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8820 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.4, "completions/mean_terminated_length": 17.4, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.6071232123212321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0645937785506248, "learning_rate": 2.0055449037307794e-06, "loss": 0.0, "num_tokens": 12700878.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8830 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.6078107810781078, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2846265882253647, "learning_rate": 1.9996644218918076e-06, "loss": 0.0, "num_tokens": 12714358.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8840 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.6084983498349835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.04482150322292, "learning_rate": 1.993786822411277e-06, "loss": 0.0, "num_tokens": 12729639.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8850 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.6091859185918592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.125196693232283, "learning_rate": 1.987912139149159e-06, "loss": 0.0, "num_tokens": 12741891.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8860 }, { "completion_length": 22.3, "completions/clipped_ratio": 0.0, "completions/max_length": 22.3, "completions/max_terminated_length": 22.3, "completions/mean_length": 19.35, "completions/mean_terminated_length": 19.35, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.6098734873487349, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0696656912565232, "learning_rate": 1.982040405948628e-06, "loss": 0.0, "num_tokens": 12757917.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8870 }, { "completion_length": 21.1, "completions/clipped_ratio": 0.0, "completions/max_length": 21.1, "completions/max_terminated_length": 21.1, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.6105610561056105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1211222544312478, "learning_rate": 1.976171656635859e-06, "loss": 0.0, "num_tokens": 12773951.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8880 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.6112486248624862, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.391440062969923, "learning_rate": 1.970305925019838e-06, "loss": 0.0, "num_tokens": 12787531.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8890 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.6119361936193619, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5265330266207457, "learning_rate": 1.9644432448921713e-06, "loss": 0.0001, "num_tokens": 12804424.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8900 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.6126237623762376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3511225588619709, "learning_rate": 1.9585836500268797e-06, "loss": 0.0, "num_tokens": 12817824.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8910 }, { "completion_length": 16.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 15.975, "completions/mean_terminated_length": 15.975, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6133113311331133, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2415352180600165, "learning_rate": 1.9527271741802127e-06, "loss": 0.0, "num_tokens": 12832847.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8920 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 17.35, "completions/mean_terminated_length": 17.35, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.613998899889989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2106421921402215, "learning_rate": 1.946873851090452e-06, "loss": 0.0, "num_tokens": 12847073.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8930 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.6146864686468647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1938246086239814, "learning_rate": 1.9410237144777185e-06, "loss": 0.0, "num_tokens": 12861929.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8940 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.175, "completions/mean_terminated_length": 17.175, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.6153740374037404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0574672678485513, "learning_rate": 1.9351767980437726e-06, "loss": 0.0, "num_tokens": 12875812.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8950 }, { "completion_length": 16.3, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 15.95, "completions/mean_terminated_length": 15.95, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.6160616061606161, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2055562317371369, "learning_rate": 1.9293331354718252e-06, "loss": 0.0, "num_tokens": 12890378.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8960 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.775, "completions/mean_terminated_length": 18.775, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.6167491749174917, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8689388711005449, "learning_rate": 1.9234927604263437e-06, "loss": 0.0, "num_tokens": 12905593.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8970 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.6174367436743674, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8989923447370529, "learning_rate": 1.9176557065528545e-06, "loss": 0.0, "num_tokens": 12919769.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8980 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.1, "completions/mean_terminated_length": 17.1, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.6181243124312431, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011197543062735349, "kl": 1.0703674156218768, "learning_rate": 1.911822007477752e-06, "loss": 0.0, "num_tokens": 12934557.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 8990 }, { "completion_length": 20.9, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 18.05, "completions/mean_terminated_length": 18.05, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.6188118811881188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0797548450529575, "learning_rate": 1.9059916968081054e-06, "loss": 0.0, "num_tokens": 12948339.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9000 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.6194994499449945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2672843962907792, "learning_rate": 1.9001648081314613e-06, "loss": 0.0, "num_tokens": 12962398.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9010 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.7, "completions/mean_terminated_length": 17.7, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.6201870187018702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.588840738683939, "learning_rate": 1.8943413750156539e-06, "loss": 0.0001, "num_tokens": 12974310.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9020 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.1, "completions/mean_terminated_length": 17.1, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.6208745874587459, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.084335508197546, "learning_rate": 1.8885214310086115e-06, "loss": 0.0, "num_tokens": 12989318.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9030 }, { "completion_length": 16.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.6215621562156216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.654494585096836, "learning_rate": 1.8827050096381604e-06, "loss": 0.0001, "num_tokens": 13002632.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9040 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.6222497249724972, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9923084802925587, "learning_rate": 1.8768921444118343e-06, "loss": 0.0, "num_tokens": 13015952.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9050 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.65, "completions/mean_terminated_length": 16.65, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.6229372937293729, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0499162547290326, "learning_rate": 1.871082868816681e-06, "loss": 0.0, "num_tokens": 13029010.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9060 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.6236248624862486, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2829386949539185, "learning_rate": 1.8652772163190683e-06, "loss": 0.0, "num_tokens": 13043842.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9070 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 18.9, "completions/mean_terminated_length": 18.9, "completions/min_length": 17.9, "completions/min_terminated_length": 17.9, "epoch": 0.6243124312431243, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9651120055466891, "learning_rate": 1.8594752203644917e-06, "loss": 0.0, "num_tokens": 13058094.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9080 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.181666761636734, "learning_rate": 1.853676914377382e-06, "loss": 0.0, "num_tokens": 13071599.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9090 }, { "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 17.85, "completions/mean_terminated_length": 17.85, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.6256875687568757, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1490161709487439, "learning_rate": 1.8478823317609135e-06, "loss": 0.0, "num_tokens": 13087053.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9100 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.6263751375137514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1776117444038392, "learning_rate": 1.8420915058968102e-06, "loss": 0.0, "num_tokens": 13102413.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9110 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.225, "completions/mean_terminated_length": 16.225, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.6270627062706271, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.052803972363472, "learning_rate": 1.8363044701451529e-06, "loss": 0.0, "num_tokens": 13114526.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9120 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.6277502750275028, "frac_reward_zero_std": 1.0, "grad_norm": 0.00811199564486742, "kl": 1.3392873076722025, "learning_rate": 1.8305212578441903e-06, "loss": 0.0, "num_tokens": 13128132.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9130 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.6284378437843784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0594568440690637, "learning_rate": 1.824741902310143e-06, "loss": 0.0, "num_tokens": 13140800.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9140 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.6291254125412541, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.267071285843849, "learning_rate": 1.8189664368370135e-06, "loss": 0.0, "num_tokens": 13154541.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9150 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.6298129812981298, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9662587443366647, "learning_rate": 1.8131948946963961e-06, "loss": 0.0, "num_tokens": 13169771.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9160 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 15.8, "completions/mean_terminated_length": 15.8, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.6305005500550055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3340294376015662, "learning_rate": 1.8074273091372807e-06, "loss": 0.0, "num_tokens": 13183363.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9170 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.6311881188118812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.236559410393238, "learning_rate": 1.8016637133858655e-06, "loss": 0.0, "num_tokens": 13199438.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9180 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.225, "completions/mean_terminated_length": 16.225, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.6318756875687569, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.271244814991951, "learning_rate": 1.7959041406453637e-06, "loss": 0.0, "num_tokens": 13213775.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9190 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.475, "completions/mean_terminated_length": 16.475, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.6325632563256326, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3046999625861644, "learning_rate": 1.7901486240958132e-06, "loss": 0.0, "num_tokens": 13226738.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9200 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.6332508250825083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0818193051964045, "learning_rate": 1.7843971968938834e-06, "loss": 0.0, "num_tokens": 13241218.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9210 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.65, "completions/mean_terminated_length": 15.65, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6339383938393839, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4174569323658943, "learning_rate": 1.7786498921726856e-06, "loss": 0.0, "num_tokens": 13256152.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9220 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.475, "completions/mean_terminated_length": 15.475, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.6346259625962596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2436523552984, "learning_rate": 1.7729067430415842e-06, "loss": 0.0, "num_tokens": 13270503.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9230 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.6353135313531353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1722389884293078, "learning_rate": 1.7671677825860024e-06, "loss": 0.0, "num_tokens": 13287583.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9240 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.636001100110011, "frac_reward_zero_std": 1.0, "grad_norm": 2.4847417080309242e-05, "kl": 1.0514770017936825, "learning_rate": 1.7614330438672316e-06, "loss": 0.0, "num_tokens": 13302451.0, "reward": 4.01999990940094, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2699999570846559, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 1.95, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9250 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.6366886688668867, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1103674413636326, "learning_rate": 1.7557025599222461e-06, "loss": 0.0, "num_tokens": 13315655.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9260 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.6373762376237624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9410890098661184, "learning_rate": 1.7499763637635052e-06, "loss": 0.0, "num_tokens": 13330641.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9270 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.6380638063806381, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3103417098522185, "learning_rate": 1.7442544883787693e-06, "loss": 0.0, "num_tokens": 13344168.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9280 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.6387513751375138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2512700453400611, "learning_rate": 1.738536966730907e-06, "loss": 0.0, "num_tokens": 13359031.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9290 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.6394389438943895, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1582110799849032, "learning_rate": 1.7328238317577056e-06, "loss": 0.0, "num_tokens": 13372995.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9300 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.6401265126512651, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0892078332602977, "learning_rate": 1.7271151163716803e-06, "loss": 0.0, "num_tokens": 13388478.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9310 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 15.725, "completions/mean_terminated_length": 15.725, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.6408140814081408, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "kl": 1.4367301620543003, "learning_rate": 1.7214108534598879e-06, "loss": 0.0, "num_tokens": 13403927.0, "reward": 4.087499904632568, "reward_std": 0.025, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.787500011920929, "rewards/quality_reward_func/std": 0.025, "step": 9320 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.6415016501650165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.463694727420807, "learning_rate": 1.7157110758837336e-06, "loss": 0.0, "num_tokens": 13418782.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9330 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 18.325, "completions/mean_terminated_length": 18.325, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.6421892189218922, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2321791395545005, "learning_rate": 1.7100158164787828e-06, "loss": 0.0, "num_tokens": 13433759.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9340 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.775, "completions/mean_terminated_length": 17.775, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.6428767876787679, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.171649180725217, "learning_rate": 1.7043251080545747e-06, "loss": 0.0, "num_tokens": 13449162.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9350 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.6435643564356436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0836325135082006, "learning_rate": 1.6986389833944283e-06, "loss": 0.0, "num_tokens": 13464062.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9360 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.4, "completions/mean_terminated_length": 15.4, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.6442519251925193, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0904548369348048, "learning_rate": 1.6929574752552586e-06, "loss": 0.0, "num_tokens": 13479542.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9370 }, { "completion_length": 20.8, "completions/clipped_ratio": 0.0, "completions/max_length": 20.8, "completions/max_terminated_length": 20.8, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.644939493949395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9454897038638592, "learning_rate": 1.6872806163673825e-06, "loss": 0.0, "num_tokens": 13493943.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9380 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.6456270627062707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5157184690237044, "learning_rate": 1.6816084394343368e-06, "loss": 0.0001, "num_tokens": 13508786.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9390 }, { "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.6463146314631463, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.102421897649765, "learning_rate": 1.6759409771326843e-06, "loss": 0.0, "num_tokens": 13522687.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9400 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 18.3, "completions/mean_terminated_length": 18.3, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.647002200220022, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1165041968226432, "learning_rate": 1.6702782621118266e-06, "loss": 0.0, "num_tokens": 13539071.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9410 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 16.45, "completions/mean_terminated_length": 16.45, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.6476897689768977, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4156561493873596, "learning_rate": 1.6646203269938197e-06, "loss": 0.0, "num_tokens": 13554297.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9420 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 16.175, "completions/mean_terminated_length": 16.175, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.6483773377337734, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3017088994383812, "learning_rate": 1.6589672043731809e-06, "loss": 0.0, "num_tokens": 13568904.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9430 }, { "completion_length": 20.4, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.6490649064906491, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.091856675222516, "learning_rate": 1.6533189268167036e-06, "loss": 0.0, "num_tokens": 13583926.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9440 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 15.7, "completions/mean_terminated_length": 15.7, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.6497524752475248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3819396480917931, "learning_rate": 1.6476755268632713e-06, "loss": 0.0, "num_tokens": 13597438.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9450 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 15.8, "completions/mean_terminated_length": 15.8, "completions/min_length": 13.9, "completions/min_terminated_length": 13.9, "epoch": 0.6504400440044005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0704050730913877, "learning_rate": 1.6420370370236666e-06, "loss": 0.0, "num_tokens": 13612506.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9460 }, { "completion_length": 16.6, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 15.525, "completions/mean_terminated_length": 15.525, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.6511276127612762, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4664267927408219, "learning_rate": 1.636403489780386e-06, "loss": 0.0, "num_tokens": 13626531.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9470 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.6518151815181518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1142109483480453, "learning_rate": 1.6307749175874532e-06, "loss": 0.0, "num_tokens": 13640416.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9480 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 15.625, "completions/mean_terminated_length": 15.625, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.6525027502750275, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2092837691307068, "learning_rate": 1.625151352870231e-06, "loss": 0.0, "num_tokens": 13655137.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9490 }, { "completion_length": 21.3, "completions/clipped_ratio": 0.0, "completions/max_length": 21.3, "completions/max_terminated_length": 21.3, "completions/mean_length": 19.35, "completions/mean_terminated_length": 19.35, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.6531903190319032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1527700431644916, "learning_rate": 1.6195328280252337e-06, "loss": 0.0, "num_tokens": 13670011.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9500 }, { "completion_length": 15.8, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 14.175, "completions/mean_terminated_length": 14.175, "completions/min_length": 13.3, "completions/min_terminated_length": 13.3, "epoch": 0.6538778877887789, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.372855794429779, "learning_rate": 1.613919375419944e-06, "loss": 0.0, "num_tokens": 13683766.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9510 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.6545654565456546, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016407572547905147, "kl": 1.2331990785896778, "learning_rate": 1.6083110273926222e-06, "loss": 0.0, "num_tokens": 13697930.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9520 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 17.775, "completions/mean_terminated_length": 17.775, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.6552530253025303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.284055880457163, "learning_rate": 1.6027078162521235e-06, "loss": 0.0, "num_tokens": 13711845.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9530 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.655940594059406, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.996816660699551, "learning_rate": 1.5971097742777063e-06, "loss": 0.0, "num_tokens": 13725672.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9540 }, { "completion_length": 15.8, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 14.85, "completions/mean_terminated_length": 14.85, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.6566281628162817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9357448853552341, "learning_rate": 1.5915169337188574e-06, "loss": 0.0, "num_tokens": 13738770.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9550 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.35, "completions/mean_terminated_length": 17.35, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6573157315731574, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7911081623286009, "learning_rate": 1.5859293267950929e-06, "loss": 0.0, "num_tokens": 13752488.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9560 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.825, "completions/mean_terminated_length": 17.825, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.658003300330033, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.933712400868535, "learning_rate": 1.5803469856957793e-06, "loss": 0.0, "num_tokens": 13764529.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9570 }, { "completion_length": 16.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.775, "completions/mean_terminated_length": 15.775, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6586908690869087, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1759530864655972, "learning_rate": 1.5747699425799511e-06, "loss": 0.0, "num_tokens": 13779596.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9580 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.425, "completions/mean_terminated_length": 18.425, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.6593784378437844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9723652474582195, "learning_rate": 1.5691982295761196e-06, "loss": 0.0, "num_tokens": 13795029.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9590 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 15.925, "completions/mean_terminated_length": 15.925, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.6600660066006601, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8927717017009854, "learning_rate": 1.563631878782088e-06, "loss": 0.0, "num_tokens": 13807122.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9600 }, { "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 15.55, "completions/mean_terminated_length": 15.55, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.6607535753575358, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017550538177601993, "kl": 0.9620685562491417, "learning_rate": 1.5580709222647727e-06, "loss": 0.0, "num_tokens": 13821148.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9610 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.6614411441144115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3227898553013802, "learning_rate": 1.5525153920600132e-06, "loss": 0.0, "num_tokens": 13835614.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9620 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.6621287128712872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.885895474255085, "learning_rate": 1.5469653201723853e-06, "loss": 0.0, "num_tokens": 13849415.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9630 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.85, "completions/mean_terminated_length": 17.85, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.6628162816281629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1234379019588232, "learning_rate": 1.541420738575026e-06, "loss": 0.0, "num_tokens": 13862733.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9640 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 17.375, "completions/mean_terminated_length": 17.375, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.6635038503850385, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9037781897000968, "learning_rate": 1.5358816792094402e-06, "loss": 0.0, "num_tokens": 13876900.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9650 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.6641914191419142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7881045371294022, "learning_rate": 1.5303481739853193e-06, "loss": 0.0, "num_tokens": 13891836.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9660 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.6648789878987899, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.994379597902298, "learning_rate": 1.5248202547803592e-06, "loss": 0.0, "num_tokens": 13908216.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9670 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.6655665566556656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9605912856757641, "learning_rate": 1.5192979534400784e-06, "loss": 0.0, "num_tokens": 13920951.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9680 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.05, "completions/mean_terminated_length": 16.05, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.6662541254125413, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1505890741944313, "learning_rate": 1.5137813017776265e-06, "loss": 0.0, "num_tokens": 13933985.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9690 }, { "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 15.225, "completions/mean_terminated_length": 15.225, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.666941694169417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.102402663975954, "learning_rate": 1.5082703315736089e-06, "loss": 0.0, "num_tokens": 13949678.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9700 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 18.9, "completions/mean_terminated_length": 18.9, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6676292629262927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9015872672200202, "learning_rate": 1.5027650745759025e-06, "loss": 0.0, "num_tokens": 13961326.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9710 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.6683168316831684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1481263168156146, "learning_rate": 1.4972655624994687e-06, "loss": 0.0, "num_tokens": 13975423.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9720 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 18.225, "completions/mean_terminated_length": 18.225, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.6690044004400441, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.088440278172493, "learning_rate": 1.491771827026172e-06, "loss": 0.0, "num_tokens": 13989612.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9730 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.025, "completions/mean_terminated_length": 16.025, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.6696919691969196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.092487622052431, "learning_rate": 1.4862838998046053e-06, "loss": 0.0, "num_tokens": 14002629.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9740 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.6703795379537953, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2421228557825088, "learning_rate": 1.4808018124498936e-06, "loss": 0.0001, "num_tokens": 14015143.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9750 }, { "completion_length": 16.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.671067106710671, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1889532297849654, "learning_rate": 1.4753255965435215e-06, "loss": 0.0, "num_tokens": 14029447.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9760 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.425, "completions/mean_terminated_length": 15.425, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.6717546754675467, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3890325456857682, "learning_rate": 1.4698552836331529e-06, "loss": 0.0, "num_tokens": 14041704.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9770 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.6724422442244224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3276241544634104, "learning_rate": 1.4643909052324403e-06, "loss": 0.0, "num_tokens": 14057100.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9780 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.6731298129812981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2909520849585534, "learning_rate": 1.4589324928208501e-06, "loss": 0.0, "num_tokens": 14072160.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9790 }, { "completion_length": 21.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.6738173817381738, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0208387814462185, "learning_rate": 1.4534800778434824e-06, "loss": 0.0, "num_tokens": 14085840.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9800 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.275, "completions/mean_terminated_length": 16.275, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.6745049504950495, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2437881991267203, "learning_rate": 1.4480336917108818e-06, "loss": 0.0, "num_tokens": 14100691.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9810 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.925, "completions/mean_terminated_length": 17.925, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.6751925192519251, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9875382862985134, "learning_rate": 1.4425933657988663e-06, "loss": 0.0, "num_tokens": 14115768.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9820 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.6758800880088008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8986855648458004, "learning_rate": 1.4371591314483384e-06, "loss": 0.0, "num_tokens": 14129856.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9830 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.075, "completions/mean_terminated_length": 17.075, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.6765676567656765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.582784863561392, "learning_rate": 1.43173101996511e-06, "loss": 0.0001, "num_tokens": 14142263.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9840 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.775, "completions/mean_terminated_length": 17.775, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.6772552255225522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.6635506592690945, "learning_rate": 1.4263090626197207e-06, "loss": 0.0001, "num_tokens": 14154430.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9850 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.6779427942794279, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.271998842060566, "learning_rate": 1.420893290647254e-06, "loss": 0.0, "num_tokens": 14167132.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9860 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 14.9, "completions/mean_terminated_length": 14.9, "completions/min_length": 14.1, "completions/min_terminated_length": 14.1, "epoch": 0.6786303630363036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2665555611252786, "learning_rate": 1.4154837352471632e-06, "loss": 0.0, "num_tokens": 14182980.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9870 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.675, "completions/mean_terminated_length": 17.675, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.6793179317931793, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9682081639766693, "learning_rate": 1.4100804275830893e-06, "loss": 0.0, "num_tokens": 14196795.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9880 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 15.975, "completions/mean_terminated_length": 15.975, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.680005500550055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3797599226236343, "learning_rate": 1.4046833987826775e-06, "loss": 0.0, "num_tokens": 14211750.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9890 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.6806930693069307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9193771593272686, "learning_rate": 1.3992926799374045e-06, "loss": 0.0, "num_tokens": 14225318.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9900 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.6813806380638063, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0620156854391098, "learning_rate": 1.3939083021023963e-06, "loss": 0.0, "num_tokens": 14239175.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9910 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.682068206820682, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3798893518745898, "learning_rate": 1.3885302962962471e-06, "loss": 0.0, "num_tokens": 14253981.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9920 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.975, "completions/mean_terminated_length": 15.975, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.6827557755775577, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2877103090286255, "learning_rate": 1.3831586935008442e-06, "loss": 0.0, "num_tokens": 14267000.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9930 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.925, "completions/mean_terminated_length": 15.925, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.6834433443344334, "frac_reward_zero_std": 1.0, "grad_norm": 4.430581975611858e-05, "kl": 1.061253872513771, "learning_rate": 1.3777935246611894e-06, "loss": 0.0, "num_tokens": 14281445.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9940 }, { "completion_length": 20.9, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 18.175, "completions/mean_terminated_length": 18.175, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.6841309130913091, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3096635900437832, "learning_rate": 1.3724348206852165e-06, "loss": 0.0001, "num_tokens": 14297116.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9950 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.65, "completions/mean_terminated_length": 16.65, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.6848184818481848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1039109639823437, "learning_rate": 1.367082612443618e-06, "loss": 0.0, "num_tokens": 14308358.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9960 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.025, "completions/mean_terminated_length": 16.025, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.6855060506050605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.123747420310974, "learning_rate": 1.3617369307696666e-06, "loss": 0.0, "num_tokens": 14323491.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9970 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.6861936193619362, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9430355286225677, "learning_rate": 1.3563978064590335e-06, "loss": 0.0, "num_tokens": 14339028.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9980 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.4, "completions/mean_terminated_length": 18.4, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.6868811881188119, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0795041255652904, "learning_rate": 1.3510652702696139e-06, "loss": 0.0, "num_tokens": 14354132.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 9990 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.6875687568756875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0660839565098286, "learning_rate": 1.3457393529213553e-06, "loss": 0.0, "num_tokens": 14368334.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10000 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.6, "completions/mean_terminated_length": 16.6, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.6882563256325632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8901675894856453, "learning_rate": 1.3404200850960688e-06, "loss": 0.0, "num_tokens": 14382166.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10010 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.6889438943894389, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1154540177434682, "learning_rate": 1.335107497437259e-06, "loss": 0.0, "num_tokens": 14396041.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10020 }, { "completion_length": 20.8, "completions/clipped_ratio": 0.0, "completions/max_length": 20.8, "completions/max_terminated_length": 20.8, "completions/mean_length": 18.225, "completions/mean_terminated_length": 18.225, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.6896314631463146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1065445344895124, "learning_rate": 1.3298016205499532e-06, "loss": 0.0, "num_tokens": 14413230.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10030 }, { "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 15.725, "completions/mean_terminated_length": 15.725, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.6903190319031903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2713998608291148, "learning_rate": 1.3245024850005122e-06, "loss": 0.0, "num_tokens": 14428363.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10040 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.691006600660066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0248003728687762, "learning_rate": 1.3192101213164614e-06, "loss": 0.0, "num_tokens": 14443696.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10050 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.175, "completions/mean_terminated_length": 17.175, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.6916941694169417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0025110706686973, "learning_rate": 1.3139245599863215e-06, "loss": 0.0, "num_tokens": 14455739.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10060 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.6923817381738174, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1045747801661492, "learning_rate": 1.3086458314594175e-06, "loss": 0.0, "num_tokens": 14470178.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10070 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.693069306930693, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0640204057097435, "learning_rate": 1.303373966145714e-06, "loss": 0.0, "num_tokens": 14487325.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10080 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.6937568756875687, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.6342476584017276, "learning_rate": 1.2981089944156422e-06, "loss": 0.0001, "num_tokens": 14503209.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10090 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.8, "completions/mean_terminated_length": 17.8, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.6944444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1191297084093095, "learning_rate": 1.292850946599915e-06, "loss": 0.0, "num_tokens": 14518209.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10100 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.6951320132013201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.116189180314541, "learning_rate": 1.2875998529893591e-06, "loss": 0.0, "num_tokens": 14529662.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10110 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.175, "completions/mean_terminated_length": 17.175, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.6958195819581958, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3250300783663989, "learning_rate": 1.2823557438347394e-06, "loss": 0.0001, "num_tokens": 14547361.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10120 }, { "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.175, "completions/mean_terminated_length": 15.175, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.6965071507150715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3464110165834426, "learning_rate": 1.2771186493465864e-06, "loss": 0.0, "num_tokens": 14562364.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10130 }, { "completion_length": 16.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 15.475, "completions/mean_terminated_length": 15.475, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.6971947194719472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1328261971473694, "learning_rate": 1.2718885996950158e-06, "loss": 0.0, "num_tokens": 14574915.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10140 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.6978822882288229, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4121058017015458, "learning_rate": 1.266665625009563e-06, "loss": 0.0001, "num_tokens": 14587588.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10150 }, { "completion_length": 18.857142857142858, "completions/clipped_ratio": 0.0, "completions/max_length": 18.857142857142858, "completions/max_terminated_length": 18.857142857142858, "completions/mean_length": 17.196428571428573, "completions/mean_terminated_length": 17.196428571428573, "completions/min_length": 15.428571428571429, "completions/min_terminated_length": 15.428571428571429, "epoch": 0.6985698569856986, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4184592621667045, "learning_rate": 1.2614497553790045e-06, "loss": 0.0, "num_tokens": 14603282.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10160 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 17.7, "completions/mean_terminated_length": 17.7, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.6992574257425742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0109045282006264, "learning_rate": 1.2562410208511835e-06, "loss": 0.0, "num_tokens": 14615646.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10170 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.85, "completions/mean_terminated_length": 16.85, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.6999449944994499, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3745124503970145, "learning_rate": 1.2510394514328417e-06, "loss": 0.0, "num_tokens": 14627912.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10180 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.7006325632563256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8388098359107972, "learning_rate": 1.2458450770894434e-06, "loss": 0.0, "num_tokens": 14642790.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10190 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.7013201320132013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.089323093742132, "learning_rate": 1.2406579277450008e-06, "loss": 0.0, "num_tokens": 14658538.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10200 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.225, "completions/mean_terminated_length": 16.225, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.702007700770077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2596516642719506, "learning_rate": 1.2354780332819064e-06, "loss": 0.0, "num_tokens": 14673923.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10210 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.7026952695269527, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.279405841231346, "learning_rate": 1.2303054235407583e-06, "loss": 0.0, "num_tokens": 14690127.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10220 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.475, "completions/mean_terminated_length": 16.475, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.7033828382838284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1789217442274094, "learning_rate": 1.225140128320186e-06, "loss": 0.0, "num_tokens": 14706774.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10230 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 15.625, "completions/mean_terminated_length": 15.625, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.7040704070407041, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.6259630009531976, "learning_rate": 1.2199821773766838e-06, "loss": 0.0001, "num_tokens": 14722671.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10240 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 18.65, "completions/mean_terminated_length": 18.65, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.7047579757975797, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0859208636917175, "learning_rate": 1.2148316004244364e-06, "loss": 0.0, "num_tokens": 14733669.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10250 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 15.35, "completions/mean_terminated_length": 15.35, "completions/min_length": 14.1, "completions/min_terminated_length": 14.1, "epoch": 0.7054455445544554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.263043588027358, "learning_rate": 1.2096884271351455e-06, "loss": 0.0, "num_tokens": 14749239.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10260 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.7061331133113311, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.286349619552493, "learning_rate": 1.2045526871378657e-06, "loss": 0.0, "num_tokens": 14766135.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10270 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.8, "completions/mean_terminated_length": 17.8, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.7068206820682068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3605617001652717, "learning_rate": 1.1994244100188246e-06, "loss": 0.0001, "num_tokens": 14779651.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10280 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 17.45, "completions/mean_terminated_length": 17.45, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.7075082508250825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0575031220912934, "learning_rate": 1.1943036253212612e-06, "loss": 0.0, "num_tokens": 14794241.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10290 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.825, "completions/mean_terminated_length": 18.825, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.7081958195819582, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9313926883041859, "learning_rate": 1.1891903625452505e-06, "loss": 0.0, "num_tokens": 14809902.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10300 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.7088833883388339, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9877796759828925, "learning_rate": 1.1840846511475338e-06, "loss": 0.0, "num_tokens": 14822749.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10310 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.7095709570957096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.033161374181509, "learning_rate": 1.1789865205413511e-06, "loss": 0.0, "num_tokens": 14836880.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10320 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.7102585258525853, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4701104439795016, "learning_rate": 1.1738960000962719e-06, "loss": 0.0, "num_tokens": 14851759.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10330 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.7109460946094609, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2840877197682858, "learning_rate": 1.1688131191380208e-06, "loss": 0.0, "num_tokens": 14866111.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10340 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.7116336633663366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1034896306693553, "learning_rate": 1.1637379069483163e-06, "loss": 0.0, "num_tokens": 14880280.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10350 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.7123212321232123, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3295586209744215, "learning_rate": 1.1586703927646975e-06, "loss": 0.0, "num_tokens": 14893224.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10360 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.1, "completions/mean_terminated_length": 16.1, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.713008800880088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7796104542911053, "learning_rate": 1.153610605780354e-06, "loss": 0.0, "num_tokens": 14907924.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10370 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.7136963696369637, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2483473094180226, "learning_rate": 1.1485585751439626e-06, "loss": 0.0, "num_tokens": 14921291.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10380 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.7143839383938394, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.23840821236372, "learning_rate": 1.1435143299595178e-06, "loss": 0.0, "num_tokens": 14936652.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10390 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.45, "completions/mean_terminated_length": 16.45, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.7150715071507151, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1562462132424116, "learning_rate": 1.1384778992861595e-06, "loss": 0.0, "num_tokens": 14947530.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10400 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7157590759075908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9479629255831241, "learning_rate": 1.1334493121380128e-06, "loss": 0.0, "num_tokens": 14961350.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10410 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.65, "completions/mean_terminated_length": 16.65, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.7164466446644664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3499415889382362, "learning_rate": 1.1284285974840168e-06, "loss": 0.0, "num_tokens": 14973636.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10420 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.7171342134213421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0426813304424285, "learning_rate": 1.123415784247756e-06, "loss": 0.0, "num_tokens": 14987143.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10430 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 13.9, "completions/min_terminated_length": 13.9, "epoch": 0.7178217821782178, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3114049814641475, "learning_rate": 1.1184109013072991e-06, "loss": 0.0, "num_tokens": 14999215.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10440 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 18.925, "completions/mean_terminated_length": 18.925, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.7185093509350935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.124692279472947, "learning_rate": 1.113413977495029e-06, "loss": 0.0, "num_tokens": 15013088.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10450 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 18.6, "completions/mean_terminated_length": 18.6, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.7191969196919692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8995658230036497, "learning_rate": 1.1084250415974745e-06, "loss": 0.0, "num_tokens": 15025608.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10460 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.225, "completions/mean_terminated_length": 15.225, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7198844884488449, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2768886677920819, "learning_rate": 1.1034441223551504e-06, "loss": 0.0, "num_tokens": 15040209.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10470 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.65, "completions/mean_terminated_length": 16.65, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7205720572057206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2183129861950874, "learning_rate": 1.0984712484623883e-06, "loss": 0.0, "num_tokens": 15054531.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10480 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.075, "completions/mean_terminated_length": 17.075, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.7212596259625963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1540740311145783, "learning_rate": 1.093506448567169e-06, "loss": 0.0, "num_tokens": 15067354.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10490 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.025, "completions/mean_terminated_length": 17.025, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.721947194719472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1697527408599853, "learning_rate": 1.0885497512709637e-06, "loss": 0.0, "num_tokens": 15081587.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10500 }, { "completion_length": 21.6, "completions/clipped_ratio": 0.0, "completions/max_length": 21.6, "completions/max_terminated_length": 21.6, "completions/mean_length": 18.575, "completions/mean_terminated_length": 18.575, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.7226347634763476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7482544800266624, "learning_rate": 1.083601185128565e-06, "loss": 0.0, "num_tokens": 15095026.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10510 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7233223322332233, "frac_reward_zero_std": 1.0, "grad_norm": 7.430253754137084e-05, "kl": 1.18574844263494, "learning_rate": 1.0786607786479202e-06, "loss": 0.0, "num_tokens": 15110380.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10520 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.724009900990099, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.192217667400837, "learning_rate": 1.0737285602899735e-06, "loss": 0.0, "num_tokens": 15125368.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10530 }, { "completion_length": 15.8, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 15.075, "completions/mean_terminated_length": 15.075, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.7246974697469747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.345651065558195, "learning_rate": 1.068804558468498e-06, "loss": 0.0, "num_tokens": 15140779.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10540 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.7253850385038504, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010292514343746006, "kl": 1.383751429617405, "learning_rate": 1.0638888015499302e-06, "loss": 0.0001, "num_tokens": 15156579.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10550 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.15, "completions/mean_terminated_length": 18.15, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.7260726072607261, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.043779155611992, "learning_rate": 1.058981317853212e-06, "loss": 0.0, "num_tokens": 15171977.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10560 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 18.275, "completions/mean_terminated_length": 18.275, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.7267601760176018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1929756373167038, "learning_rate": 1.054082135649622e-06, "loss": 0.0, "num_tokens": 15185172.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10570 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7274477447744775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0113733187317848, "learning_rate": 1.0491912831626164e-06, "loss": 0.0, "num_tokens": 15198205.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10580 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.7281353135313532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9798096269369125, "learning_rate": 1.0443087885676658e-06, "loss": 0.0, "num_tokens": 15215453.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10590 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 16.325, "completions/mean_terminated_length": 16.325, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.7288228822882288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0549237057566643, "learning_rate": 1.0394346799920904e-06, "loss": 0.0, "num_tokens": 15230446.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10600 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.7295104510451045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0190134070813657, "learning_rate": 1.034568985514901e-06, "loss": 0.0, "num_tokens": 15244652.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10610 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7301980198019802, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9635543545708061, "learning_rate": 1.029711733166637e-06, "loss": 0.0, "num_tokens": 15260489.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10620 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.9, "completions/mean_terminated_length": 17.9, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.7308855885588559, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1127851918339728, "learning_rate": 1.024862950929201e-06, "loss": 0.0, "num_tokens": 15276797.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10630 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.9, "completions/mean_terminated_length": 17.9, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.7315731573157316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.112170697748661, "learning_rate": 1.0200226667357032e-06, "loss": 0.0, "num_tokens": 15290765.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10640 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.7322607260726073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.117611725628376, "learning_rate": 1.015190908470298e-06, "loss": 0.0, "num_tokens": 15306645.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10650 }, { "completion_length": 19.26, "completions/clipped_ratio": 0.0, "completions/max_length": 19.153846153846153, "completions/max_terminated_length": 19.153846153846153, "completions/mean_length": 17.942307692307693, "completions/mean_terminated_length": 17.942307692307693, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.732948294829483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0812714359164237, "learning_rate": 1.01036770396802e-06, "loss": 0.0, "num_tokens": 15320489.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10660 }, { "completion_length": 19.225, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.05, "completions/mean_terminated_length": 17.05, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7336358635863587, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3272650197148324, "learning_rate": 1.0055530810146303e-06, "loss": 0.0, "num_tokens": 15333095.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10670 }, { "completion_length": 16.825, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.4, "completions/mean_terminated_length": 15.4, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.7343234323432343, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0134508907794952, "learning_rate": 1.0007470673464527e-06, "loss": 0.0, "num_tokens": 15345835.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10680 }, { "completion_length": 19.125, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.025, "completions/mean_terminated_length": 18.025, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.73501100110011, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4031531304121017, "learning_rate": 9.959496906502112e-07, "loss": 0.0, "num_tokens": 15362448.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10690 }, { "completion_length": 18.825, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 16.45, "completions/mean_terminated_length": 16.45, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.7356985698569857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2530749052762986, "learning_rate": 9.911609785628765e-07, "loss": 0.0, "num_tokens": 15376242.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10700 }, { "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.7363861386138614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1614648193120956, "learning_rate": 9.863809586715037e-07, "loss": 0.0, "num_tokens": 15394874.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10710 }, { "completion_length": 18.35, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 15.975, "completions/mean_terminated_length": 15.975, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7370737073707371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8098263021558523, "learning_rate": 9.816096585130709e-07, "loss": 0.0, "num_tokens": 15409397.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10720 }, { "completion_length": 19.85, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 16.825, "completions/mean_terminated_length": 16.825, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.7377612761276128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9473735511302948, "learning_rate": 9.768471055743254e-07, "loss": 0.0, "num_tokens": 15424394.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10730 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.825, "completions/mean_terminated_length": 16.825, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.7384488448844885, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.22924984395504, "learning_rate": 9.720933272916235e-07, "loss": 0.0, "num_tokens": 15437375.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10740 }, { "completion_length": 17.025, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.475, "completions/mean_terminated_length": 16.475, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7391364136413642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9831631615757942, "learning_rate": 9.673483510507692e-07, "loss": 0.0, "num_tokens": 15450350.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10750 }, { "completion_length": 18.325, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.7398239823982399, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.060418665036559, "learning_rate": 9.62612204186862e-07, "loss": 0.0, "num_tokens": 15464067.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10760 }, { "completion_length": 20.55, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 18.625, "completions/mean_terminated_length": 18.625, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.7405115511551155, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0736682925373315, "learning_rate": 9.578849139841362e-07, "loss": 0.0, "num_tokens": 15478704.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10770 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.15, "completions/mean_terminated_length": 16.15, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7411991199119912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005870894528925419, "kl": 1.1081381618976593, "learning_rate": 9.531665076758026e-07, "loss": 0.0, "num_tokens": 15495570.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10780 }, { "completion_length": 16.95, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 16.15, "completions/mean_terminated_length": 16.15, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.7418866886688669, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2765638522803784, "learning_rate": 9.484570124438946e-07, "loss": 0.0, "num_tokens": 15511384.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10790 }, { "completion_length": 18.125, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7425742574257426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1013303212821484, "learning_rate": 9.437564554191111e-07, "loss": 0.0, "num_tokens": 15526608.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10800 }, { "completion_length": 17.125, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 15.6, "completions/mean_terminated_length": 15.6, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.7432618261826183, "frac_reward_zero_std": 1.0, "grad_norm": 2.6196155886282213e-05, "kl": 1.0360828436911107, "learning_rate": 9.390648636806563e-07, "loss": 0.0, "num_tokens": 15540468.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10810 }, { "completion_length": 18.45, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.725, "completions/mean_terminated_length": 16.725, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.743949394939494, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9936655193567276, "learning_rate": 9.343822642560895e-07, "loss": 0.0, "num_tokens": 15556757.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10820 }, { "completion_length": 17.625, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.6, "completions/mean_terminated_length": 16.6, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.7446369636963697, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.056114462018013, "learning_rate": 9.297086841211669e-07, "loss": 0.0, "num_tokens": 15569505.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10830 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.9, "completions/mean_terminated_length": 17.9, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.7453245324532454, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1308002784848212, "learning_rate": 9.250441501996826e-07, "loss": 0.0, "num_tokens": 15581973.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10840 }, { "completion_length": 20.45, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.746012101210121, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0938160687685012, "learning_rate": 9.203886893633201e-07, "loss": 0.0, "num_tokens": 15596610.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10850 }, { "completion_length": 18.775, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.7466996699669967, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0044926359678357, "learning_rate": 9.157423284314937e-07, "loss": 0.0, "num_tokens": 15611085.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10860 }, { "completion_length": 19.775, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.825, "completions/mean_terminated_length": 17.825, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.7473872387238724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0912924561649562, "learning_rate": 9.111050941711921e-07, "loss": 0.0, "num_tokens": 15626426.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10870 }, { "completion_length": 16.925, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 16.475, "completions/mean_terminated_length": 16.475, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.7480748074807481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0661979235708714, "learning_rate": 9.064770132968304e-07, "loss": 0.0, "num_tokens": 15641333.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10880 }, { "completion_length": 17.15, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7487623762376238, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.229330799728632, "learning_rate": 9.018581124700884e-07, "loss": 0.0, "num_tokens": 15653165.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10890 }, { "completion_length": 19.075, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.525, "completions/mean_terminated_length": 17.525, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7494499449944995, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2027806408703328, "learning_rate": 8.972484182997643e-07, "loss": 0.0, "num_tokens": 15669818.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10900 }, { "completion_length": 21.375, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.15, "completions/mean_terminated_length": 18.15, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.7501375137513752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.895715744048357, "learning_rate": 8.926479573416172e-07, "loss": 0.0, "num_tokens": 15684488.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10910 }, { "completion_length": 17.85, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.2, "completions/mean_terminated_length": 16.2, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.7508250825082509, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9663356371223927, "learning_rate": 8.880567560982134e-07, "loss": 0.0, "num_tokens": 15698420.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10920 }, { "completion_length": 17.975, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.7515126512651266, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.209364048205316, "learning_rate": 8.834748410187774e-07, "loss": 0.0, "num_tokens": 15710892.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10930 }, { "completion_length": 19.55, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.7522002200220022, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2022224888205528, "learning_rate": 8.789022384990372e-07, "loss": 0.0, "num_tokens": 15725903.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10940 }, { "completion_length": 17.075, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 15.875, "completions/mean_terminated_length": 15.875, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.7528877887788779, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0960259534418584, "learning_rate": 8.743389748810712e-07, "loss": 0.0, "num_tokens": 15740358.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10950 }, { "completion_length": 17.65, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.725, "completions/mean_terminated_length": 16.725, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7535753575357536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0783400520682336, "learning_rate": 8.697850764531587e-07, "loss": 0.0, "num_tokens": 15754639.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10960 }, { "completion_length": 19.225, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.7542629262926293, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1874236330389976, "learning_rate": 8.652405694496291e-07, "loss": 0.0, "num_tokens": 15767538.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10970 }, { "completion_length": 19.875, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.754950495049505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9586891086772085, "learning_rate": 8.607054800507056e-07, "loss": 0.0, "num_tokens": 15780158.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10980 }, { "completion_length": 16.725, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.575, "completions/mean_terminated_length": 15.575, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.7556380638063807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0593073539435864, "learning_rate": 8.561798343823605e-07, "loss": 0.0, "num_tokens": 15795585.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 10990 }, { "completion_length": 19.35, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 18.2, "completions/mean_terminated_length": 18.2, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.7563256325632564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3741861015558243, "learning_rate": 8.516636585161634e-07, "loss": 0.0, "num_tokens": 15810417.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11000 }, { "completion_length": 20.925, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 18.025, "completions/mean_terminated_length": 18.025, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.7570132013201321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9687342491000891, "learning_rate": 8.471569784691264e-07, "loss": 0.0, "num_tokens": 15823666.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11010 }, { "completion_length": 20.65, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 18.675, "completions/mean_terminated_length": 18.675, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.7577007700770076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1399923454970122, "learning_rate": 8.426598202035579e-07, "loss": 0.0, "num_tokens": 15838349.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11020 }, { "completion_length": 19.95, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.225, "completions/mean_terminated_length": 18.225, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.7583883388338833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9907348096370697, "learning_rate": 8.38172209626918e-07, "loss": 0.0, "num_tokens": 15853046.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11030 }, { "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.9, "completions/min_terminated_length": 13.9, "epoch": 0.759075907590759, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.256610096991062, "learning_rate": 8.336941725916575e-07, "loss": 0.0, "num_tokens": 15868708.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11040 }, { "completion_length": 18.275, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.7597634763476347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0946700729429721, "learning_rate": 8.292257348950772e-07, "loss": 0.0, "num_tokens": 15881905.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11050 }, { "completion_length": 19.325, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.375, "completions/mean_terminated_length": 17.375, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.7604510451045104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1086218349635601, "learning_rate": 8.247669222791815e-07, "loss": 0.0, "num_tokens": 15893120.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11060 }, { "completion_length": 17.825, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.7611386138613861, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1578792456537486, "learning_rate": 8.203177604305215e-07, "loss": 0.0, "num_tokens": 15907908.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11070 }, { "completion_length": 20.925, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.2, "completions/mean_terminated_length": 18.2, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7618261826182618, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.014317450672388, "learning_rate": 8.158782749800515e-07, "loss": 0.0, "num_tokens": 15922136.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11080 }, { "completion_length": 16.525, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.525, "completions/mean_terminated_length": 15.525, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.7625137513751375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3503883443772793, "learning_rate": 8.114484915029863e-07, "loss": 0.0, "num_tokens": 15936637.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11090 }, { "completion_length": 17.725, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.7632013201320133, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0547269459581003, "learning_rate": 8.070284355186447e-07, "loss": 0.0, "num_tokens": 15951961.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11100 }, { "completion_length": 17.07547169811321, "completions/clipped_ratio": 0.0, "completions/max_length": 17.153846153846153, "completions/max_terminated_length": 17.153846153846153, "completions/mean_length": 16.653846153846153, "completions/mean_terminated_length": 16.653846153846153, "completions/min_length": 16.307692307692307, "completions/min_terminated_length": 16.307692307692307, "epoch": 0.7638888888888888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4417094655756681, "learning_rate": 8.026181324903068e-07, "loss": 0.0, "num_tokens": 15967185.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11110 }, { "completion_length": 16.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.7645764576457645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2374431431293487, "learning_rate": 7.98217607825072e-07, "loss": 0.0, "num_tokens": 15982335.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11120 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7652640264026402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.313457754254341, "learning_rate": 7.938268868737026e-07, "loss": 0.0, "num_tokens": 15998082.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11130 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.7659515951595159, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8375833176076413, "learning_rate": 7.894459949304844e-07, "loss": 0.0, "num_tokens": 16014065.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11140 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.275, "completions/mean_terminated_length": 16.275, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.7666391639163916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2038681738078594, "learning_rate": 7.850749572330838e-07, "loss": 0.0, "num_tokens": 16027484.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11150 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.7673267326732673, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2629554875195026, "learning_rate": 7.807137989623936e-07, "loss": 0.0, "num_tokens": 16041616.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11160 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.768014301430143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4642689004540443, "learning_rate": 7.763625452423937e-07, "loss": 0.0001, "num_tokens": 16058189.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11170 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 16.35, "completions/mean_terminated_length": 16.35, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7687018701870187, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1750748664140702, "learning_rate": 7.720212211400066e-07, "loss": 0.0, "num_tokens": 16073455.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11180 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7693894389438944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1646143794059753, "learning_rate": 7.676898516649525e-07, "loss": 0.0, "num_tokens": 16089164.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11190 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 18.075, "completions/mean_terminated_length": 18.075, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.77007700770077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.119681691750884, "learning_rate": 7.63368461769601e-07, "loss": 0.0, "num_tokens": 16106619.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11200 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 18.4, "completions/mean_terminated_length": 18.4, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.7707645764576457, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0832272429019212, "learning_rate": 7.590570763488347e-07, "loss": 0.0, "num_tokens": 16121667.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11210 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.65, "completions/mean_terminated_length": 17.65, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.7714521452145214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3181848049163818, "learning_rate": 7.547557202399003e-07, "loss": 0.0, "num_tokens": 16136101.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11220 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.85, "completions/mean_terminated_length": 16.85, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7721397139713971, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2974165245890616, "learning_rate": 7.504644182222656e-07, "loss": 0.0, "num_tokens": 16149939.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11230 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.775, "completions/mean_terminated_length": 15.775, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.7728272827282728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9349052540957927, "learning_rate": 7.461831950174808e-07, "loss": 0.0, "num_tokens": 16164334.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11240 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.975, "completions/mean_terminated_length": 17.975, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.7735148514851485, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.066166415065527, "learning_rate": 7.419120752890333e-07, "loss": 0.0, "num_tokens": 16179677.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11250 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.7742024202420242, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8953022979199886, "learning_rate": 7.376510836422032e-07, "loss": 0.0, "num_tokens": 16194973.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11260 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.7748899889988999, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1213195119053125, "learning_rate": 7.334002446239274e-07, "loss": 0.0, "num_tokens": 16211415.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11270 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.7755775577557755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8371943077072501, "learning_rate": 7.291595827226547e-07, "loss": 0.0, "num_tokens": 16227240.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11280 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.125, "completions/mean_terminated_length": 17.125, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.7762651265126512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2180199906229974, "learning_rate": 7.249291223682021e-07, "loss": 0.0, "num_tokens": 16241409.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11290 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.65, "completions/mean_terminated_length": 16.65, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.7769526952695269, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.090780709683895, "learning_rate": 7.207088879316199e-07, "loss": 0.0, "num_tokens": 16255099.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11300 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.7776402640264026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9828387029469013, "learning_rate": 7.164989037250483e-07, "loss": 0.0, "num_tokens": 16269658.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11310 }, { "completion_length": 16.6, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 16.225, "completions/mean_terminated_length": 16.225, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.7783278327832783, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4672671839594842, "learning_rate": 7.122991940015756e-07, "loss": 0.0, "num_tokens": 16282775.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11320 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.125, "completions/mean_terminated_length": 17.125, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.779015401540154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2950967982411385, "learning_rate": 7.081097829551026e-07, "loss": 0.0, "num_tokens": 16298468.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11330 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.85, "completions/mean_terminated_length": 16.85, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.7797029702970297, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0951971739530564, "learning_rate": 7.039306947201985e-07, "loss": 0.0, "num_tokens": 16310814.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11340 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.85, "completions/mean_terminated_length": 15.85, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.7803905390539054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9441638633608818, "learning_rate": 6.997619533719663e-07, "loss": 0.0, "num_tokens": 16324880.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11350 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.7810781078107811, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1288403883576392, "learning_rate": 6.956035829259025e-07, "loss": 0.0, "num_tokens": 16340389.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11360 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.1, "completions/mean_terminated_length": 17.1, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.7817656765676567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1189744628965854, "learning_rate": 6.914556073377554e-07, "loss": 0.0, "num_tokens": 16355577.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11370 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.2, "completions/mean_terminated_length": 16.2, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7824532453245324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2965394303202629, "learning_rate": 6.87318050503393e-07, "loss": 0.0, "num_tokens": 16371945.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11380 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.7831408140814081, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0501157040242106, "learning_rate": 6.831909362586625e-07, "loss": 0.0, "num_tokens": 16385930.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11390 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.7838283828382838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2721909329295158, "learning_rate": 6.7907428837925e-07, "loss": 0.0, "num_tokens": 16401023.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11400 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 15.875, "completions/mean_terminated_length": 15.875, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.7845159515951595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1036891281604766, "learning_rate": 6.749681305805494e-07, "loss": 0.0, "num_tokens": 16415422.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11410 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 15.975, "completions/mean_terminated_length": 15.975, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7852035203520352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2872200727462768, "learning_rate": 6.70872486517522e-07, "loss": 0.0, "num_tokens": 16430057.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11420 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 17.975, "completions/mean_terminated_length": 17.975, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.7858910891089109, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.969875092804432, "learning_rate": 6.667873797845598e-07, "loss": 0.0, "num_tokens": 16441728.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11430 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.4, "completions/mean_terminated_length": 17.4, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.7865786578657866, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1505745474249125, "learning_rate": 6.627128339153519e-07, "loss": 0.0, "num_tokens": 16455928.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11440 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.7872662266226622, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1520945608615876, "learning_rate": 6.586488723827489e-07, "loss": 0.0, "num_tokens": 16470611.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11450 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 16.325, "completions/mean_terminated_length": 16.325, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.7879537953795379, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2398535877466201, "learning_rate": 6.545955185986239e-07, "loss": 0.0, "num_tokens": 16483880.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11460 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.8, "completions/mean_terminated_length": 15.8, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.7886413641364136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1811271376907826, "learning_rate": 6.505527959137429e-07, "loss": 0.0, "num_tokens": 16497636.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11470 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.7893289328932893, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.083561733365059, "learning_rate": 6.465207276176267e-07, "loss": 0.0, "num_tokens": 16511392.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11480 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.790016501650165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.18923562746495, "learning_rate": 6.424993369384169e-07, "loss": 0.0, "num_tokens": 16525208.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11490 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7907040704070407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4898881241679192, "learning_rate": 6.384886470427443e-07, "loss": 0.0, "num_tokens": 16541084.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11500 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.475, "completions/mean_terminated_length": 16.475, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.7913916391639164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9230179768055677, "learning_rate": 6.344886810355944e-07, "loss": 0.0, "num_tokens": 16555483.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11510 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.375, "completions/mean_terminated_length": 17.375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7920792079207921, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8842333287000657, "learning_rate": 6.304994619601721e-07, "loss": 0.0, "num_tokens": 16571166.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11520 }, { "completion_length": 16.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 16.475, "completions/mean_terminated_length": 16.475, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.7927667766776678, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9936220183968544, "learning_rate": 6.26521012797772e-07, "loss": 0.0, "num_tokens": 16584989.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11530 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.4, "completions/mean_terminated_length": 17.4, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.7934543454345434, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1388952665030956, "learning_rate": 6.225533564676456e-07, "loss": 0.0, "num_tokens": 16600197.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11540 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.7941419141914191, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3170918300747871, "learning_rate": 6.185965158268667e-07, "loss": 0.0, "num_tokens": 16616340.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11550 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.7948294829482948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0344215095043183, "learning_rate": 6.146505136702027e-07, "loss": 0.0, "num_tokens": 16632526.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11560 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.7955170517051705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.230242745578289, "learning_rate": 6.107153727299834e-07, "loss": 0.0, "num_tokens": 16647513.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11570 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.1, "completions/mean_terminated_length": 17.1, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.7962046204620462, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1020235106348992, "learning_rate": 6.067911156759648e-07, "loss": 0.0, "num_tokens": 16663637.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11580 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.65, "completions/mean_terminated_length": 15.65, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.7968921892189219, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.144069493561983, "learning_rate": 6.028777651152062e-07, "loss": 0.0, "num_tokens": 16678595.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11590 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.7975797579757976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3637000352144242, "learning_rate": 5.989753435919357e-07, "loss": 0.0001, "num_tokens": 16692162.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11600 }, { "completion_length": 18.416666666666668, "completions/clipped_ratio": 0.0, "completions/max_length": 18.416666666666668, "completions/max_terminated_length": 18.416666666666668, "completions/mean_length": 16.979166666666668, "completions/mean_terminated_length": 16.979166666666668, "completions/min_length": 15.416666666666666, "completions/min_terminated_length": 15.416666666666666, "epoch": 0.7982673267326733, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.09130197763443, "learning_rate": 5.950838735874184e-07, "loss": 0.0, "num_tokens": 16705471.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11610 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.2, "completions/mean_terminated_length": 18.2, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.7989548954895489, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1447665378451348, "learning_rate": 5.912033775198331e-07, "loss": 0.0, "num_tokens": 16720619.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11620 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.95, "completions/mean_terminated_length": 16.95, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7996424642464246, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0139610884711145, "learning_rate": 5.873338777441351e-07, "loss": 0.0, "num_tokens": 16734125.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11630 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.8003300330033003, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.306389456987381, "learning_rate": 5.834753965519349e-07, "loss": 0.0001, "num_tokens": 16750174.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11640 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.801017601760176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1532538570463657, "learning_rate": 5.796279561713666e-07, "loss": 0.0, "num_tokens": 16764337.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11650 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.8017051705170517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9973506443202496, "learning_rate": 5.757915787669574e-07, "loss": 0.0, "num_tokens": 16777904.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11660 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 16.475, "completions/mean_terminated_length": 16.475, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.8023927392739274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3113906651735305, "learning_rate": 5.719662864395045e-07, "loss": 0.0, "num_tokens": 16791731.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11670 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 18.8, "completions/mean_terminated_length": 18.8, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8030803080308031, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2495213493704795, "learning_rate": 5.681521012259453e-07, "loss": 0.0, "num_tokens": 16806199.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11680 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.8037678767876788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.057341678440571, "learning_rate": 5.643490450992292e-07, "loss": 0.0, "num_tokens": 16821401.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11690 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.35, "completions/mean_terminated_length": 16.35, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.8044554455445545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.7063321620225906, "learning_rate": 5.605571399681947e-07, "loss": 0.0001, "num_tokens": 16838507.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11700 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.7, "completions/mean_terminated_length": 18.7, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.8051430143014301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.971458313986659, "learning_rate": 5.567764076774409e-07, "loss": 0.0, "num_tokens": 16853143.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11710 }, { "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.8058305830583058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1677193390205503, "learning_rate": 5.530068700071994e-07, "loss": 0.0, "num_tokens": 16867949.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11720 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.325, "completions/mean_terminated_length": 16.325, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.8065181518151815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4122251868247986, "learning_rate": 5.492485486732141e-07, "loss": 0.0, "num_tokens": 16883650.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11730 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.05, "completions/mean_terminated_length": 17.05, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.8072057205720572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4119791135191917, "learning_rate": 5.455014653266124e-07, "loss": 0.0001, "num_tokens": 16898232.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11740 }, { "completion_length": 16.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 14.8, "completions/mean_terminated_length": 14.8, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.8078932893289329, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1706212140619754, "learning_rate": 5.417656415537803e-07, "loss": 0.0, "num_tokens": 16910052.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11750 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.8085808580858086, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0780875638127327, "learning_rate": 5.3804109887624e-07, "loss": 0.0, "num_tokens": 16925640.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11760 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.8092684268426843, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9206266243010759, "learning_rate": 5.34327858750526e-07, "loss": 0.0, "num_tokens": 16939243.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11770 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.075, "completions/mean_terminated_length": 17.075, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.80995599559956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2972722038626672, "learning_rate": 5.306259425680582e-07, "loss": 0.0, "num_tokens": 16952522.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11780 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.8106435643564357, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0005511239171028, "learning_rate": 5.26935371655021e-07, "loss": 0.0, "num_tokens": 16967364.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11790 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.35, "completions/mean_terminated_length": 16.35, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.8113311331133113, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.019502779468894, "learning_rate": 5.232561672722444e-07, "loss": 0.0, "num_tokens": 16979394.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11800 }, { "completion_length": 20.842105263157894, "completions/clipped_ratio": 0.0, "completions/max_length": 20.842105263157894, "completions/max_terminated_length": 20.842105263157894, "completions/mean_length": 18.19736842105263, "completions/mean_terminated_length": 18.19736842105263, "completions/min_length": 16.263157894736842, "completions/min_terminated_length": 16.263157894736842, "epoch": 0.812018701870187, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.208753490918561, "learning_rate": 5.195883506150734e-07, "loss": 0.0, "num_tokens": 16992543.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11810 }, { "completion_length": 21.9, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 18.675, "completions/mean_terminated_length": 18.675, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8127062706270627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1330634988844395, "learning_rate": 5.159319428132492e-07, "loss": 0.0, "num_tokens": 17009594.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11820 }, { "completion_length": 20.9, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 18.225, "completions/mean_terminated_length": 18.225, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.8133938393839384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.196343858540058, "learning_rate": 5.122869649307938e-07, "loss": 0.0, "num_tokens": 17023455.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11830 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.8140814081408141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1742313604801893, "learning_rate": 5.086534379658778e-07, "loss": 0.0, "num_tokens": 17039332.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11840 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 18.1, "completions/mean_terminated_length": 18.1, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.8147689768976898, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1800262570381164, "learning_rate": 5.050313828507055e-07, "loss": 0.0, "num_tokens": 17052828.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11850 }, { "completion_length": 21.3, "completions/clipped_ratio": 0.0, "completions/max_length": 21.3, "completions/max_terminated_length": 21.3, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.8154565456545655, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1351016148924828, "learning_rate": 5.014208204513978e-07, "loss": 0.0, "num_tokens": 17065986.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11860 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 18.05, "completions/mean_terminated_length": 18.05, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.8161441144114412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9795697268098593, "learning_rate": 4.97821771567863e-07, "loss": 0.0, "num_tokens": 17080304.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11870 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.8168316831683168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0502688817679882, "learning_rate": 4.942342569336822e-07, "loss": 0.0, "num_tokens": 17096789.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11880 }, { "completion_length": 15.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 15.625, "completions/mean_terminated_length": 15.625, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.8175192519251925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.022003109753132, "learning_rate": 4.906582972159934e-07, "loss": 0.0, "num_tokens": 17111030.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11890 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.8182068206820682, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2291306972503662, "learning_rate": 4.870939130153643e-07, "loss": 0.0, "num_tokens": 17125137.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11900 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.525, "completions/mean_terminated_length": 17.525, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.8188943894389439, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4226929068565368, "learning_rate": 4.835411248656783e-07, "loss": 0.0001, "num_tokens": 17138670.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11910 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.8195819581958196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.151364193856716, "learning_rate": 4.799999532340188e-07, "loss": 0.0, "num_tokens": 17153253.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11920 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.8202695269526953, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.6235528096556664, "learning_rate": 4.7647041852054517e-07, "loss": 0.0001, "num_tokens": 17169132.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11930 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.375, "completions/mean_terminated_length": 16.375, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.820957095709571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.519204391539097, "learning_rate": 4.7295254105837795e-07, "loss": 0.0001, "num_tokens": 17183219.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11940 }, { "completion_length": 18.9, "completions/clipped_ratio": 0.0, "completions/max_length": 18.9, "completions/max_terminated_length": 18.9, "completions/mean_length": 16.2, "completions/mean_terminated_length": 16.2, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.8216446644664467, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.19991021938622, "learning_rate": 4.694463411134839e-07, "loss": 0.0, "num_tokens": 17197323.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11950 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.4, "completions/mean_terminated_length": 17.4, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.8223322332233224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3400846868753433, "learning_rate": 4.659518388845574e-07, "loss": 0.0, "num_tokens": 17213383.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11960 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.125, "completions/mean_terminated_length": 17.125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.823019801980198, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4152716927230358, "learning_rate": 4.6246905450290177e-07, "loss": 0.0001, "num_tokens": 17226944.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11970 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.8237073707370737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8381059406907297, "learning_rate": 4.5899800803231753e-07, "loss": 0.0, "num_tokens": 17241132.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11980 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.8243949394939494, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2028470493853092, "learning_rate": 4.5553871946898533e-07, "loss": 0.0, "num_tokens": 17253357.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 11990 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.275, "completions/mean_terminated_length": 16.275, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.8250825082508251, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0104025208100211, "learning_rate": 4.520912087413476e-07, "loss": 0.0, "num_tokens": 17266712.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12000 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.8257700770077008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1107265980914236, "learning_rate": 4.4865549570999865e-07, "loss": 0.0, "num_tokens": 17279424.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12010 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.8264576457645765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.145281489752233, "learning_rate": 4.4523160016756853e-07, "loss": 0.0, "num_tokens": 17293686.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12020 }, { "completion_length": 16.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.825, "completions/mean_terminated_length": 15.825, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.8271452145214522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5294427379965783, "learning_rate": 4.418195418386059e-07, "loss": 0.0, "num_tokens": 17306271.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12030 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 17.075, "completions/mean_terminated_length": 17.075, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.8278327832783279, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.342365524172783, "learning_rate": 4.38419340379469e-07, "loss": 0.0001, "num_tokens": 17317698.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12040 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.6, "completions/mean_terminated_length": 15.6, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.8285203520352035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9734742298722268, "learning_rate": 4.350310153782108e-07, "loss": 0.0, "num_tokens": 17332362.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12050 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 18.3, "completions/mean_terminated_length": 18.3, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.8292079207920792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9848667308688164, "learning_rate": 4.3165458635446385e-07, "loss": 0.0, "num_tokens": 17348478.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12060 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.95, "completions/mean_terminated_length": 16.95, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.8298954895489549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3417460560798644, "learning_rate": 4.282900727593317e-07, "loss": 0.0, "num_tokens": 17364152.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12070 }, { "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 15.075, "completions/mean_terminated_length": 15.075, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.8305830583058306, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0196630969643592, "learning_rate": 4.249374939752729e-07, "loss": 0.0, "num_tokens": 17378167.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12080 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 18.15, "completions/mean_terminated_length": 18.15, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.8312706270627063, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3768038228154182, "learning_rate": 4.2159686931599304e-07, "loss": 0.0, "num_tokens": 17393413.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12090 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.831958195819582, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.991391484439373, "learning_rate": 4.1826821802633197e-07, "loss": 0.0, "num_tokens": 17408768.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12100 }, { "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 15.1, "completions/mean_terminated_length": 15.1, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.8326457645764577, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1597263135015965, "learning_rate": 4.149515592821507e-07, "loss": 0.0, "num_tokens": 17422016.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12110 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 17.1, "completions/mean_terminated_length": 17.1, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.8333333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3087869144976139, "learning_rate": 4.1164691219022473e-07, "loss": 0.0, "num_tokens": 17433988.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12120 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.05, "completions/mean_terminated_length": 17.05, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.8340209020902091, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0112441785633564, "learning_rate": 4.083542957881326e-07, "loss": 0.0, "num_tokens": 17447030.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12130 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.8347084708470847, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.188933937251568, "learning_rate": 4.0507372904414364e-07, "loss": 0.0, "num_tokens": 17462114.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12140 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.8353960396039604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.171751768887043, "learning_rate": 4.018052308571127e-07, "loss": 0.0, "num_tokens": 17476490.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12150 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 16.225, "completions/mean_terminated_length": 16.225, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.8360836083608361, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3831602782011032, "learning_rate": 3.9854882005637003e-07, "loss": 0.0001, "num_tokens": 17489251.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12160 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.175, "completions/mean_terminated_length": 16.175, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.8367711771177118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0102228112518787, "learning_rate": 3.9530451540160994e-07, "loss": 0.0, "num_tokens": 17505214.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12170 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 17.775, "completions/mean_terminated_length": 17.775, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.8374587458745875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0496548250317574, "learning_rate": 3.920723355827868e-07, "loss": 0.0, "num_tokens": 17520393.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12180 }, { "completion_length": 16.6, "completions/clipped_ratio": 0.0, "completions/max_length": 16.6, "completions/max_terminated_length": 16.6, "completions/mean_length": 15.875, "completions/mean_terminated_length": 15.875, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.8381463146314632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.27838274538517, "learning_rate": 3.888522992200058e-07, "loss": 0.0, "num_tokens": 17533852.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12190 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.8388338833883389, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3012643240392208, "learning_rate": 3.856444248634139e-07, "loss": 0.0, "num_tokens": 17546611.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12200 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 18.55, "completions/mean_terminated_length": 18.55, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.8395214521452146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0045187152922153, "learning_rate": 3.8244873099309604e-07, "loss": 0.0, "num_tokens": 17560577.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12210 }, { "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.8402090209020903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2544339559972286, "learning_rate": 3.792652360189672e-07, "loss": 0.0, "num_tokens": 17573923.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12220 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 17.425, "completions/mean_terminated_length": 17.425, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.8408965896589659, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0459356896579266, "learning_rate": 3.7609395828066467e-07, "loss": 0.0, "num_tokens": 17587936.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12230 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.4, "completions/mean_terminated_length": 15.4, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.8415841584158416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1170457277446986, "learning_rate": 3.72934916047446e-07, "loss": 0.0, "num_tokens": 17602004.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12240 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.8422717271727173, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0330356847494842, "learning_rate": 3.6978812751808147e-07, "loss": 0.0, "num_tokens": 17616080.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12250 }, { "completion_length": 20.9, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 18.325, "completions/mean_terminated_length": 18.325, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.842959295929593, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2558301001787187, "learning_rate": 3.666536108207483e-07, "loss": 0.0, "num_tokens": 17628565.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12260 }, { "completion_length": 16.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.8436468646864687, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2137410737574101, "learning_rate": 3.635313840129287e-07, "loss": 0.0, "num_tokens": 17644067.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12270 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 18.325, "completions/mean_terminated_length": 18.325, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.8443344334433444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1762503921985625, "learning_rate": 3.604214650813051e-07, "loss": 0.0, "num_tokens": 17659524.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12280 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.8450220022002201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.343212166428566, "learning_rate": 3.573238719416544e-07, "loss": 0.0, "num_tokens": 17674470.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12290 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8457095709570958, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.060416190326214, "learning_rate": 3.5423862243874756e-07, "loss": 0.0, "num_tokens": 17686972.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12300 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.8463971397139713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1023333728313447, "learning_rate": 3.5116573434624655e-07, "loss": 0.0, "num_tokens": 17699097.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12310 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 16.825, "completions/mean_terminated_length": 16.825, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.847084708470847, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8042195193633234, "learning_rate": 3.4810522536659925e-07, "loss": 0.0, "num_tokens": 17714886.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12320 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.8477722772277227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2248048588633538, "learning_rate": 3.450571131309399e-07, "loss": 0.0, "num_tokens": 17728671.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12330 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.8484598459845984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2919159044511617, "learning_rate": 3.4202141519898867e-07, "loss": 0.0, "num_tokens": 17742729.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12340 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.8491474147414741, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0141687914729118, "learning_rate": 3.3899814905894583e-07, "loss": 0.0, "num_tokens": 17756071.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12350 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 15.975, "completions/mean_terminated_length": 15.975, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.8498349834983498, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1330483600497245, "learning_rate": 3.3598733212739597e-07, "loss": 0.0, "num_tokens": 17771758.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12360 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.8505225522552256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1200291037559509, "learning_rate": 3.32988981749206e-07, "loss": 0.0, "num_tokens": 17786990.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12370 }, { "completion_length": 16.2, "completions/clipped_ratio": 0.0, "completions/max_length": 16.2, "completions/max_terminated_length": 16.2, "completions/mean_length": 14.875, "completions/mean_terminated_length": 14.875, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.8512101210121013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4351926416158676, "learning_rate": 3.3000311519742283e-07, "loss": 0.0, "num_tokens": 17801433.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12380 }, { "completion_length": 16.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.275, "completions/mean_terminated_length": 15.275, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.851897689768977, "frac_reward_zero_std": 1.0, "grad_norm": 8.997012628242373e-05, "kl": 1.3986145965754986, "learning_rate": 3.2702974967317835e-07, "loss": 0.0, "num_tokens": 17815808.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12390 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.375, "completions/mean_terminated_length": 16.375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8525852585258525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1760932793607934, "learning_rate": 3.240689023055854e-07, "loss": 0.0, "num_tokens": 17828123.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12400 }, { "completion_length": 23.2, "completions/clipped_ratio": 0.0, "completions/max_length": 23.2, "completions/max_terminated_length": 23.2, "completions/mean_length": 19.175, "completions/mean_terminated_length": 19.175, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.8532728272827282, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.105794148147106, "learning_rate": 3.211205901516437e-07, "loss": 0.0, "num_tokens": 17841030.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12410 }, { "completion_length": 20.8, "completions/clipped_ratio": 0.0, "completions/max_length": 20.8, "completions/max_terminated_length": 20.8, "completions/mean_length": 18.6, "completions/mean_terminated_length": 18.6, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.8539603960396039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0678372394293547, "learning_rate": 3.181848301961388e-07, "loss": 0.0, "num_tokens": 17855914.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12420 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.725, "completions/mean_terminated_length": 17.725, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.8546479647964796, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0019166065379976, "learning_rate": 3.1526163935154397e-07, "loss": 0.0, "num_tokens": 17871087.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12430 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.925, "completions/mean_terminated_length": 15.925, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.8553355335533553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1093901080545039, "learning_rate": 3.12351034457925e-07, "loss": 0.0, "num_tokens": 17884596.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12440 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.856023102310231, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.124355961382389, "learning_rate": 3.0945303228284177e-07, "loss": 0.0, "num_tokens": 17899507.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12450 }, { "completion_length": 16.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 15.525, "completions/mean_terminated_length": 15.525, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.8567106710671067, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2013919986784458, "learning_rate": 3.065676495212508e-07, "loss": 0.0, "num_tokens": 17915424.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12460 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 16.525, "completions/mean_terminated_length": 16.525, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.8573982398239824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2107804849743844, "learning_rate": 3.036949027954109e-07, "loss": 0.0, "num_tokens": 17930161.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12470 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.525, "completions/mean_terminated_length": 16.525, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.858085808580858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.27331640701741, "learning_rate": 3.0083480865478685e-07, "loss": 0.0, "num_tokens": 17945414.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12480 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 15.625, "completions/mean_terminated_length": 15.625, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.8587733773377337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9072589740157128, "learning_rate": 2.9798738357595225e-07, "loss": 0.0, "num_tokens": 17958511.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12490 }, { "completion_length": 20.4, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 17.325, "completions/mean_terminated_length": 17.325, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.8594609460946094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.260063112527132, "learning_rate": 2.951526439624977e-07, "loss": 0.0, "num_tokens": 17972100.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12500 }, { "completion_length": 16.1, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 15.05, "completions/mean_terminated_length": 15.05, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.8601485148514851, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5602857261896133, "learning_rate": 2.923306061449349e-07, "loss": 0.0, "num_tokens": 17986002.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12510 }, { "completion_length": 16.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.55, "completions/mean_terminated_length": 15.55, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.8608360836083608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7896676227450371, "learning_rate": 2.8952128638060046e-07, "loss": 0.0, "num_tokens": 17999748.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12520 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 18.075, "completions/mean_terminated_length": 18.075, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.8615236523652365, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3531237110495566, "learning_rate": 2.867247008535659e-07, "loss": 0.0001, "num_tokens": 18014167.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12530 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.8622112211221122, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0438602246344089, "learning_rate": 2.8394086567454263e-07, "loss": 0.0, "num_tokens": 18026800.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12540 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 18.825, "completions/mean_terminated_length": 18.825, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.8628987898789879, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9149656176567078, "learning_rate": 2.8116979688078835e-07, "loss": 0.0, "num_tokens": 18042869.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12550 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 16.025, "completions/mean_terminated_length": 16.025, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.8635863586358636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.06132320612669, "learning_rate": 2.7841151043601576e-07, "loss": 0.0, "num_tokens": 18057798.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12560 }, { "completion_length": 22.3, "completions/clipped_ratio": 0.0, "completions/max_length": 22.3, "completions/max_terminated_length": 22.3, "completions/mean_length": 19.575, "completions/mean_terminated_length": 19.575, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.8642739273927392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8372251026332378, "learning_rate": 2.756660222303015e-07, "loss": 0.0, "num_tokens": 18070733.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12570 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.8649614961496149, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.156419077515602, "learning_rate": 2.7293334807999143e-07, "loss": 0.0, "num_tokens": 18086654.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12580 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.325, "completions/mean_terminated_length": 16.325, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.8656490649064906, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1113948434591294, "learning_rate": 2.7021350372761325e-07, "loss": 0.0, "num_tokens": 18100699.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12590 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 15.85, "completions/mean_terminated_length": 15.85, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.8663366336633663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2044957906007767, "learning_rate": 2.675065048417838e-07, "loss": 0.0, "num_tokens": 18113257.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12600 }, { "completion_length": 15.8, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 15.6, "completions/mean_terminated_length": 15.6, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.867024202420242, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.307245521992445, "learning_rate": 2.6481236701711823e-07, "loss": 0.0, "num_tokens": 18125317.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12610 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.8677117711771177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4338117763400078, "learning_rate": 2.621311057741416e-07, "loss": 0.0, "num_tokens": 18139897.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12620 }, { "completion_length": 20.2, "completions/clipped_ratio": 0.0, "completions/max_length": 20.2, "completions/max_terminated_length": 20.2, "completions/mean_length": 17.775, "completions/mean_terminated_length": 17.775, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.8683993399339934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3830397330224513, "learning_rate": 2.594627365592001e-07, "loss": 0.0001, "num_tokens": 18154572.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12630 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.8690869086908691, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3043985791504382, "learning_rate": 2.568072747443681e-07, "loss": 0.0, "num_tokens": 18168840.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12640 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.8697744774477447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1103583849966525, "learning_rate": 2.541647356273652e-07, "loss": 0.0, "num_tokens": 18183688.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12650 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.325, "completions/mean_terminated_length": 16.325, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.8704620462046204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9618835780536756, "learning_rate": 2.51535134431464e-07, "loss": 0.0, "num_tokens": 18198533.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12660 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.8711496149614961, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0352514579892158, "learning_rate": 2.489184863054028e-07, "loss": 0.0, "num_tokens": 18211649.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12670 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.8718371837183718, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1152288825251162, "learning_rate": 2.4631480632330147e-07, "loss": 0.0, "num_tokens": 18225704.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12680 }, { "completion_length": 21.1, "completions/clipped_ratio": 0.0, "completions/max_length": 21.1, "completions/max_terminated_length": 21.1, "completions/mean_length": 18.6, "completions/mean_terminated_length": 18.6, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.8725247524752475, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0932188533246516, "learning_rate": 2.4372410948456965e-07, "loss": 0.0, "num_tokens": 18239780.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12690 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.8732123212321232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3037765175104141, "learning_rate": 2.4114641071382533e-07, "loss": 0.0, "num_tokens": 18254521.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12700 }, { "completion_length": 22.9, "completions/clipped_ratio": 0.0, "completions/max_length": 22.9, "completions/max_terminated_length": 22.9, "completions/mean_length": 20.075, "completions/mean_terminated_length": 20.075, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.8738998899889989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8483290739357472, "learning_rate": 2.385817248608055e-07, "loss": 0.0, "num_tokens": 18269092.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12710 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.8745874587458746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9952517352998257, "learning_rate": 2.3603006670028156e-07, "loss": 0.0, "num_tokens": 18285112.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12720 }, { "completion_length": 21.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 19.875, "completions/mean_terminated_length": 19.875, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.8752750275027503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3253495544195175, "learning_rate": 2.3349145093197468e-07, "loss": 0.0001, "num_tokens": 18299095.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12730 }, { "completion_length": 16.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.275, "completions/mean_terminated_length": 15.275, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.8759625962596259, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3488694623112678, "learning_rate": 2.3096589218047084e-07, "loss": 0.0, "num_tokens": 18312658.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12740 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.8766501650165016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9493882581591606, "learning_rate": 2.2845340499513591e-07, "loss": 0.0, "num_tokens": 18327125.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12750 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.8773377337733773, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1807494819164277, "learning_rate": 2.2595400385003324e-07, "loss": 0.0, "num_tokens": 18342442.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12760 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.878025302530253, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2734124332666397, "learning_rate": 2.2346770314383925e-07, "loss": 0.0, "num_tokens": 18354299.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12770 }, { "completion_length": 16.7, "completions/clipped_ratio": 0.0, "completions/max_length": 16.7, "completions/max_terminated_length": 16.7, "completions/mean_length": 15.85, "completions/mean_terminated_length": 15.85, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8787128712871287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.44075445830822, "learning_rate": 2.2099451719975961e-07, "loss": 0.0, "num_tokens": 18366681.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12780 }, { "completion_length": 15.8, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 14.95, "completions/mean_terminated_length": 14.95, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.8794004400440044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3333537511527538, "learning_rate": 2.1853446026544934e-07, "loss": 0.0, "num_tokens": 18379827.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12790 }, { "completion_length": 20.9, "completions/clipped_ratio": 0.0, "completions/max_length": 20.9, "completions/max_terminated_length": 20.9, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.8800880088008801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0007655970752238, "learning_rate": 2.160875465129289e-07, "loss": 0.0, "num_tokens": 18395231.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12800 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 18.65, "completions/mean_terminated_length": 18.65, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8807755775577558, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0084638118743896, "learning_rate": 2.136537900385016e-07, "loss": 0.0, "num_tokens": 18409345.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12810 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.8814631463146315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1932608442381025, "learning_rate": 2.1123320486267473e-07, "loss": 0.0, "num_tokens": 18423506.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12820 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 17.85, "completions/mean_terminated_length": 17.85, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.8821507150715071, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3098089814186096, "learning_rate": 2.0882580493007844e-07, "loss": 0.0, "num_tokens": 18437276.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12830 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.8828382838283828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3223581999540328, "learning_rate": 2.064316041093828e-07, "loss": 0.0, "num_tokens": 18452850.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12840 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.8835258525852585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.154933588206768, "learning_rate": 2.0405061619322014e-07, "loss": 0.0, "num_tokens": 18467220.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12850 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 17.075, "completions/mean_terminated_length": 17.075, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.8842134213421342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3365027002990246, "learning_rate": 2.0168285489810757e-07, "loss": 0.0, "num_tokens": 18481047.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12860 }, { "completion_length": 19.4, "completions/clipped_ratio": 0.0, "completions/max_length": 19.4, "completions/max_terminated_length": 19.4, "completions/mean_length": 17.075, "completions/mean_terminated_length": 17.075, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.8849009900990099, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2854986555874348, "learning_rate": 1.9932833386436257e-07, "loss": 0.0, "num_tokens": 18498298.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12870 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 16.225, "completions/mean_terminated_length": 16.225, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.8855885588558856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1420640468597412, "learning_rate": 1.9698706665602807e-07, "loss": 0.0, "num_tokens": 18513727.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12880 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.4, "completions/mean_terminated_length": 18.4, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.8862761276127613, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.045914850383997, "learning_rate": 1.9465906676079589e-07, "loss": 0.0, "num_tokens": 18527687.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12890 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.886963696369637, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2545997768640518, "learning_rate": 1.9234434758992476e-07, "loss": 0.0, "num_tokens": 18541802.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12900 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.225, "completions/mean_terminated_length": 16.225, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.8876512651265126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0713786840438844, "learning_rate": 1.900429224781647e-07, "loss": 0.0, "num_tokens": 18557563.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12910 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.8883388338833883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1286379843950272, "learning_rate": 1.8775480468368329e-07, "loss": 0.0, "num_tokens": 18570483.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12920 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.4, "completions/mean_terminated_length": 17.4, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.889026402640264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1670819714665412, "learning_rate": 1.8548000738798367e-07, "loss": 0.0, "num_tokens": 18583383.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12930 }, { "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 16.55, "completions/mean_terminated_length": 16.55, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.8897139713971397, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9982646837830543, "learning_rate": 1.832185436958328e-07, "loss": 0.0, "num_tokens": 18597357.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12940 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.675, "completions/mean_terminated_length": 16.675, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.8904015401540154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1279569923877717, "learning_rate": 1.809704266351861e-07, "loss": 0.0, "num_tokens": 18612940.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12950 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 17.225, "completions/mean_terminated_length": 17.225, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.8910891089108911, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2641742438077928, "learning_rate": 1.7873566915710877e-07, "loss": 0.0, "num_tokens": 18626889.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12960 }, { "completion_length": 15.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 15.45, "completions/mean_terminated_length": 15.45, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8917766776677668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3066837146878243, "learning_rate": 1.7651428413570427e-07, "loss": 0.0, "num_tokens": 18640899.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12970 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.45, "completions/mean_terminated_length": 15.45, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.8924642464246425, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8145103223621846, "learning_rate": 1.7430628436803986e-07, "loss": 0.0, "num_tokens": 18654805.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12980 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 17.15, "completions/mean_terminated_length": 17.15, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.8931518151815182, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7342884339392185, "learning_rate": 1.7211168257407236e-07, "loss": 0.0, "num_tokens": 18667655.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 12990 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 16.325, "completions/mean_terminated_length": 16.325, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.8938393839383938, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001674629165790975, "kl": 1.1786029890179635, "learning_rate": 1.699304913965738e-07, "loss": 0.0, "num_tokens": 18684740.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13000 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8945269526952695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.6468408346176147, "learning_rate": 1.6776272340106064e-07, "loss": 0.0001, "num_tokens": 18698088.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13010 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.8952145214521452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.5087524473667144, "learning_rate": 1.656083910757206e-07, "loss": 0.0, "num_tokens": 18710790.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13020 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.225, "completions/mean_terminated_length": 16.225, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.8959020902090209, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4609697625041007, "learning_rate": 1.6346750683133933e-07, "loss": 0.0, "num_tokens": 18726031.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13030 }, { "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 17.475, "completions/mean_terminated_length": 17.475, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.8965896589658966, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.086915911734104, "learning_rate": 1.61340083001231e-07, "loss": 0.0, "num_tokens": 18743302.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13040 }, { "completion_length": 15.6, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 14.55, "completions/mean_terminated_length": 14.55, "completions/min_length": 13.9, "completions/min_terminated_length": 13.9, "epoch": 0.8972772277227723, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3440804041922092, "learning_rate": 1.5922613184116669e-07, "loss": 0.0, "num_tokens": 18758904.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13050 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 15.85, "completions/mean_terminated_length": 15.85, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.897964796479648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0510776333510876, "learning_rate": 1.5712566552930187e-07, "loss": 0.0, "num_tokens": 18773934.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13060 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.8986523652365237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0665404554456472, "learning_rate": 1.5503869616610962e-07, "loss": 0.0, "num_tokens": 18787642.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13070 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.8993399339933993, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.235348106920719, "learning_rate": 1.5296523577430834e-07, "loss": 0.0, "num_tokens": 18800170.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13080 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.900027502750275, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9928356692194938, "learning_rate": 1.509052962987928e-07, "loss": 0.0, "num_tokens": 18813606.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13090 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.9007150715071507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0992518614977598, "learning_rate": 1.4885888960656653e-07, "loss": 0.0, "num_tokens": 18829635.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13100 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.9014026402640264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0986389875411988, "learning_rate": 1.4682602748667312e-07, "loss": 0.0, "num_tokens": 18845951.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13110 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.9020902090209021, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1406767390668393, "learning_rate": 1.4480672165012587e-07, "loss": 0.0, "num_tokens": 18860947.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13120 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.9027777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0592786885797978, "learning_rate": 1.428009837298447e-07, "loss": 0.0, "num_tokens": 18875363.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13130 }, { "completion_length": 16.1, "completions/clipped_ratio": 0.0, "completions/max_length": 16.1, "completions/max_terminated_length": 16.1, "completions/mean_length": 15.125, "completions/mean_terminated_length": 15.125, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.9034653465346535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0585973650217055, "learning_rate": 1.408088252805853e-07, "loss": 0.0, "num_tokens": 18890568.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13140 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.7, "completions/mean_terminated_length": 18.7, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.9041529152915292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0279006749391555, "learning_rate": 1.38830257778875e-07, "loss": 0.0, "num_tokens": 18906784.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13150 }, { "completion_length": 18.923076923076923, "completions/clipped_ratio": 0.0, "completions/max_length": 18.923076923076923, "completions/max_terminated_length": 18.923076923076923, "completions/mean_length": 16.423076923076923, "completions/mean_terminated_length": 16.423076923076923, "completions/min_length": 14.615384615384615, "completions/min_terminated_length": 14.615384615384615, "epoch": 0.9048404840484049, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.98125021560834, "learning_rate": 1.3686529262294522e-07, "loss": 0.0, "num_tokens": 18922358.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13160 }, { "completion_length": 19.8, "completions/clipped_ratio": 0.0, "completions/max_length": 19.8, "completions/max_terminated_length": 19.8, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.9055280528052805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8814746301621199, "learning_rate": 1.349139411326661e-07, "loss": 0.0, "num_tokens": 18937807.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13170 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.925, "completions/mean_terminated_length": 18.925, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.9062156215621562, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.194991097226739, "learning_rate": 1.3297621454948173e-07, "loss": 0.0, "num_tokens": 18952048.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13180 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 17.825, "completions/mean_terminated_length": 17.825, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.9069031903190319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8364911857992411, "learning_rate": 1.3105212403634605e-07, "loss": 0.0, "num_tokens": 18968381.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13190 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 18.875, "completions/mean_terminated_length": 18.875, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.9075907590759076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1928697779774666, "learning_rate": 1.2914168067765548e-07, "loss": 0.0, "num_tokens": 18981420.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13200 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 17.45, "completions/mean_terminated_length": 17.45, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.9082783278327833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1705792464315892, "learning_rate": 1.272448954791891e-07, "loss": 0.0, "num_tokens": 18996202.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13210 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.908965896589659, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3263706386089325, "learning_rate": 1.2536177936804312e-07, "loss": 0.0001, "num_tokens": 19011981.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13220 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.85, "completions/mean_terminated_length": 16.85, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.9096534653465347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2736763119697572, "learning_rate": 1.234923431925672e-07, "loss": 0.0, "num_tokens": 19026291.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13230 }, { "completion_length": 21.8, "completions/clipped_ratio": 0.0, "completions/max_length": 21.8, "completions/max_terminated_length": 21.8, "completions/mean_length": 17.45, "completions/mean_terminated_length": 17.45, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.9103410341034104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9903134420514107, "learning_rate": 1.2163659772230384e-07, "loss": 0.0, "num_tokens": 19038481.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13240 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.911028602860286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1821248844265937, "learning_rate": 1.1979455364792559e-07, "loss": 0.0, "num_tokens": 19050137.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13250 }, { "completion_length": 16.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 15.95, "completions/mean_terminated_length": 15.95, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.9117161716171617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.973328822851181, "learning_rate": 1.1796622158117221e-07, "loss": 0.0, "num_tokens": 19063003.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13260 }, { "completion_length": 16.2, "completions/clipped_ratio": 0.0, "completions/max_length": 16.2, "completions/max_terminated_length": 16.2, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.9124037403740374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2858654290437699, "learning_rate": 1.1615161205479176e-07, "loss": 0.0, "num_tokens": 19076091.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13270 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.9130913091309131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1074205577373504, "learning_rate": 1.1435073552247849e-07, "loss": 0.0, "num_tokens": 19090750.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13280 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.375, "completions/mean_terminated_length": 17.375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9137788778877888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.861525659263134, "learning_rate": 1.1256360235881197e-07, "loss": 0.0, "num_tokens": 19104073.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13290 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.9144664466446645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2444539576768876, "learning_rate": 1.1079022285919977e-07, "loss": 0.0, "num_tokens": 19119956.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13300 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.9151540154015402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.072230564057827, "learning_rate": 1.0903060723981645e-07, "loss": 0.0, "num_tokens": 19135839.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13310 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.375, "completions/mean_terminated_length": 16.375, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.9158415841584159, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0989422407001257, "learning_rate": 1.0728476563754336e-07, "loss": 0.0, "num_tokens": 19146522.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13320 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9165291529152916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1121281802654266, "learning_rate": 1.0555270810991336e-07, "loss": 0.0, "num_tokens": 19161591.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13330 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.775, "completions/mean_terminated_length": 17.775, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.9172167216721672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3947931826114655, "learning_rate": 1.0383444463505116e-07, "loss": 0.0, "num_tokens": 19175926.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13340 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.9179042904290429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3302877321839333, "learning_rate": 1.0212998511161504e-07, "loss": 0.0, "num_tokens": 19189601.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13350 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.9185918591859186, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0459544956684113, "learning_rate": 1.004393393587419e-07, "loss": 0.0, "num_tokens": 19201935.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13360 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9192794279427943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0560489945113658, "learning_rate": 9.876251711598921e-08, "loss": 0.0, "num_tokens": 19216613.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13370 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.05, "completions/mean_terminated_length": 17.05, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.91996699669967, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.165436042100191, "learning_rate": 9.709952804327848e-08, "loss": 0.0, "num_tokens": 19230839.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13380 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 15.925, "completions/mean_terminated_length": 15.925, "completions/min_length": 15.2, "completions/min_terminated_length": 15.2, "epoch": 0.9206545654565457, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.529273784160614, "learning_rate": 9.545038172084159e-08, "loss": 0.0001, "num_tokens": 19248520.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13390 }, { "completion_length": 17.9, "completions/clipped_ratio": 0.0, "completions/max_length": 17.9, "completions/max_terminated_length": 17.9, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.9213421342134214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3466318234801293, "learning_rate": 9.381508764916419e-08, "loss": 0.0, "num_tokens": 19263628.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13400 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.925, "completions/mean_terminated_length": 16.925, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.9220297029702971, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1184769354760646, "learning_rate": 9.219365524892998e-08, "loss": 0.0, "num_tokens": 19279845.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13410 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 16.1, "completions/mean_terminated_length": 16.1, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9227172717271728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.331002026796341, "learning_rate": 9.058609386096956e-08, "loss": 0.0, "num_tokens": 19296153.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13420 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.35, "completions/mean_terminated_length": 17.35, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.9234048404840484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1488326989114284, "learning_rate": 8.899241274620246e-08, "loss": 0.0, "num_tokens": 19312639.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13430 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 16.4, "completions/mean_terminated_length": 16.4, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.9240924092409241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4825765755027533, "learning_rate": 8.741262108558829e-08, "loss": 0.0, "num_tokens": 19326283.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13440 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.9247799779977998, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2604569263756276, "learning_rate": 8.584672798007038e-08, "loss": 0.0, "num_tokens": 19341090.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13450 }, { "completion_length": 15.9, "completions/clipped_ratio": 0.0, "completions/max_length": 15.9, "completions/max_terminated_length": 15.9, "completions/mean_length": 15.225, "completions/mean_terminated_length": 15.225, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.9254675467546755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0171996034681796, "learning_rate": 8.429474245052416e-08, "loss": 0.0, "num_tokens": 19355939.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13460 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.9261551155115512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1658393494784831, "learning_rate": 8.275667343770665e-08, "loss": 0.0, "num_tokens": 19370339.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13470 }, { "completion_length": 20.3, "completions/clipped_ratio": 0.0, "completions/max_length": 20.3, "completions/max_terminated_length": 20.3, "completions/mean_length": 18.8, "completions/mean_terminated_length": 18.8, "completions/min_length": 16.8, "completions/min_terminated_length": 16.8, "epoch": 0.9268426842684269, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1757941499352456, "learning_rate": 8.123252980220347e-08, "loss": 0.0, "num_tokens": 19385667.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13480 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 16.85, "completions/mean_terminated_length": 16.85, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.9275302530253026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1713169284164906, "learning_rate": 7.972232032437799e-08, "loss": 0.0, "num_tokens": 19400153.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13490 }, { "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 21.925, "completions/mean_terminated_length": 21.925, "completions/min_length": 19.6, "completions/min_terminated_length": 19.6, "epoch": 0.9282178217821783, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9489764098078013, "learning_rate": 7.822605370432113e-08, "loss": 0.0, "num_tokens": 19416986.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13500 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 18.55, "completions/mean_terminated_length": 18.55, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.9289053905390539, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0770463801920414, "learning_rate": 7.67437385618014e-08, "loss": 0.0, "num_tokens": 19434016.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13510 }, { "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 15.825, "completions/mean_terminated_length": 15.825, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.9295929592959296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1189071394503116, "learning_rate": 7.527538343621437e-08, "loss": 0.0, "num_tokens": 19448509.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13520 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.45, "completions/mean_terminated_length": 16.45, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.9302805280528053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9916746720671654, "learning_rate": 7.382099678653414e-08, "loss": 0.0, "num_tokens": 19461895.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13530 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.125, "completions/mean_terminated_length": 17.125, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.930968096809681, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9048127435147763, "learning_rate": 7.238058699126554e-08, "loss": 0.0, "num_tokens": 19476128.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13540 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.9316556655665567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1816849932074547, "learning_rate": 7.095416234839309e-08, "loss": 0.0, "num_tokens": 19491140.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13550 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 16.275, "completions/mean_terminated_length": 16.275, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.9323432343234324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1026445545256138, "learning_rate": 6.95417310753363e-08, "loss": 0.0, "num_tokens": 19504803.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13560 }, { "completion_length": 21.3, "completions/clipped_ratio": 0.0, "completions/max_length": 21.3, "completions/max_terminated_length": 21.3, "completions/mean_length": 19.35, "completions/mean_terminated_length": 19.35, "completions/min_length": 17.7, "completions/min_terminated_length": 17.7, "epoch": 0.9330308030803081, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0979745283722877, "learning_rate": 6.814330130890056e-08, "loss": 0.0, "num_tokens": 19518245.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13570 }, { "completion_length": 16.9, "completions/clipped_ratio": 0.0, "completions/max_length": 16.9, "completions/max_terminated_length": 16.9, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.9337183718371838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.169330464117229, "learning_rate": 6.675888110523027e-08, "loss": 0.0, "num_tokens": 19532806.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13580 }, { "completion_length": 16.4, "completions/clipped_ratio": 0.0, "completions/max_length": 16.4, "completions/max_terminated_length": 16.4, "completions/mean_length": 15.475, "completions/mean_terminated_length": 15.475, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.9344059405940595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.034810755169019, "learning_rate": 6.53884784397632e-08, "loss": 0.0, "num_tokens": 19545657.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13590 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.935093509350935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9879947159439325, "learning_rate": 6.403210120718484e-08, "loss": 0.0, "num_tokens": 19560965.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13600 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.65, "completions/mean_terminated_length": 16.65, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.9357810781078107, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3628393903374671, "learning_rate": 6.268975722138082e-08, "loss": 0.0, "num_tokens": 19575915.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13610 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 18.95, "completions/mean_terminated_length": 18.95, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9364686468646864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.136725478619337, "learning_rate": 6.136145421539424e-08, "loss": 0.0, "num_tokens": 19589693.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13620 }, { "completion_length": 16.8, "completions/clipped_ratio": 0.0, "completions/max_length": 16.8, "completions/max_terminated_length": 16.8, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.1, "completions/min_terminated_length": 14.1, "epoch": 0.9371562156215621, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.384533066302538, "learning_rate": 6.004719984138041e-08, "loss": 0.0, "num_tokens": 19607999.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13630 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.9378437843784379, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0318208899348975, "learning_rate": 5.874700167056213e-08, "loss": 0.0, "num_tokens": 19620139.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13640 }, { "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.025, "completions/mean_terminated_length": 15.025, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.9385313531353136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2272925559431314, "learning_rate": 5.746086719318617e-08, "loss": 0.0, "num_tokens": 19634384.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13650 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 16.1, "completions/mean_terminated_length": 16.1, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.9392189218921893, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3484910354018211, "learning_rate": 5.618880381848157e-08, "loss": 0.0, "num_tokens": 19647180.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13660 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.939906490649065, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.254742594063282, "learning_rate": 5.4930818874615045e-08, "loss": 0.0, "num_tokens": 19661903.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13670 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 19.225, "completions/mean_terminated_length": 19.225, "completions/min_length": 17.4, "completions/min_terminated_length": 17.4, "epoch": 0.9405940594059405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.099889063090086, "learning_rate": 5.368691960864925e-08, "loss": 0.0, "num_tokens": 19676620.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13680 }, { "completion_length": 19.6, "completions/clipped_ratio": 0.0, "completions/max_length": 19.6, "completions/max_terminated_length": 19.6, "completions/mean_length": 17.4, "completions/mean_terminated_length": 17.4, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.9412816281628162, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4136764787137508, "learning_rate": 5.245711318650232e-08, "loss": 0.0001, "num_tokens": 19693448.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13690 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.9419691969196919, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2639874435961247, "learning_rate": 5.124140669290512e-08, "loss": 0.0, "num_tokens": 19706119.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13700 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 16.825, "completions/mean_terminated_length": 16.825, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.9426567656765676, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.219635272026062, "learning_rate": 5.0039807131360415e-08, "loss": 0.0, "num_tokens": 19719824.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13710 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.9433443344334433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1531033888459206, "learning_rate": 4.885232142410462e-08, "loss": 0.0, "num_tokens": 19732704.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13720 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.175, "completions/mean_terminated_length": 15.175, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.944031903190319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1203917101025582, "learning_rate": 4.7678956412064435e-08, "loss": 0.0, "num_tokens": 19745163.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13730 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.45, "completions/mean_terminated_length": 16.45, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.9447194719471947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1155082169920205, "learning_rate": 4.651971885482054e-08, "loss": 0.0, "num_tokens": 19759317.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13740 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 18.2, "completions/mean_terminated_length": 18.2, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.9454070407040704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2580350935459137, "learning_rate": 4.537461543056676e-08, "loss": 0.0, "num_tokens": 19771657.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13750 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.05, "completions/mean_terminated_length": 19.05, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.9460946094609461, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.245084148645401, "learning_rate": 4.424365273607345e-08, "loss": 0.0, "num_tokens": 19784435.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13760 }, { "completion_length": 19.3, "completions/clipped_ratio": 0.0, "completions/max_length": 19.3, "completions/max_terminated_length": 19.3, "completions/mean_length": 18.15, "completions/mean_terminated_length": 18.15, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.9467821782178217, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.08096777764149, "learning_rate": 4.312683728664696e-08, "loss": 0.0, "num_tokens": 19799121.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13770 }, { "completion_length": 21.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.9474697469746974, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.173080563545227, "learning_rate": 4.202417551609383e-08, "loss": 0.0, "num_tokens": 19812087.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13780 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.9481573157315731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2509358823299408, "learning_rate": 4.0935673776684725e-08, "loss": 0.0, "num_tokens": 19826437.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13790 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 15.375, "completions/mean_terminated_length": 15.375, "completions/min_length": 14.2, "completions/min_terminated_length": 14.2, "epoch": 0.9488448844884488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0518343433737756, "learning_rate": 3.986133833911415e-08, "loss": 0.0, "num_tokens": 19839192.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13800 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 16.725, "completions/mean_terminated_length": 16.725, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.9495324532453245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0350565232336522, "learning_rate": 3.8801175392468584e-08, "loss": 0.0, "num_tokens": 19854041.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13810 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.9502200220022002, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2483361944556237, "learning_rate": 3.775519104418812e-08, "loss": 0.0, "num_tokens": 19869149.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13820 }, { "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.775, "completions/mean_terminated_length": 16.775, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.9509075907590759, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2970769941806792, "learning_rate": 3.672339132003211e-08, "loss": 0.0, "num_tokens": 19886136.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13830 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.35, "completions/mean_terminated_length": 18.35, "completions/min_length": 16.3, "completions/min_terminated_length": 16.3, "epoch": 0.9515951595159516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3883480228483678, "learning_rate": 3.5705782164044135e-08, "loss": 0.0001, "num_tokens": 19897482.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13840 }, { "completion_length": 15.6, "completions/clipped_ratio": 0.0, "completions/max_length": 15.6, "completions/max_terminated_length": 15.6, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.9522827282728272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9344463728368282, "learning_rate": 3.470236943851929e-08, "loss": 0.0, "num_tokens": 19910592.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13850 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.9529702970297029, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0786833353340626, "learning_rate": 3.371315892396698e-08, "loss": 0.0, "num_tokens": 19924222.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13860 }, { "completion_length": 20.6, "completions/clipped_ratio": 0.0, "completions/max_length": 20.6, "completions/max_terminated_length": 20.6, "completions/mean_length": 18.65, "completions/mean_terminated_length": 18.65, "completions/min_length": 16.7, "completions/min_terminated_length": 16.7, "epoch": 0.9536578657865786, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.420877918601036, "learning_rate": 3.2738156319082336e-08, "loss": 0.0001, "num_tokens": 19939312.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13870 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.35, "completions/mean_terminated_length": 17.35, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.9543454345434543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.182000921666622, "learning_rate": 3.1777367240708455e-08, "loss": 0.0, "num_tokens": 19953590.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13880 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.4, "completions/mean_terminated_length": 17.4, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.95503300330033, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1275596469640732, "learning_rate": 3.0830797223808106e-08, "loss": 0.0, "num_tokens": 19969106.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13890 }, { "completion_length": 20.7, "completions/clipped_ratio": 0.0, "completions/max_length": 20.7, "completions/max_terminated_length": 20.7, "completions/mean_length": 18.1, "completions/mean_terminated_length": 18.1, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.9557205720572057, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4078487813472749, "learning_rate": 2.989845172142958e-08, "loss": 0.0001, "num_tokens": 19985234.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13900 }, { "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.9564081408140814, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.7960809737443924, "learning_rate": 2.89803361046756e-08, "loss": 0.0, "num_tokens": 20002139.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13910 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.9570957095709571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9877739049494266, "learning_rate": 2.8076455662673363e-08, "loss": 0.0, "num_tokens": 20019710.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13920 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 18.55, "completions/mean_terminated_length": 18.55, "completions/min_length": 16.9, "completions/min_terminated_length": 16.9, "epoch": 0.9577832783278328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1007904268801212, "learning_rate": 2.7186815602542606e-08, "loss": 0.0, "num_tokens": 20035352.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13930 }, { "completion_length": 18.4, "completions/clipped_ratio": 0.0, "completions/max_length": 18.4, "completions/max_terminated_length": 18.4, "completions/mean_length": 15.9, "completions/mean_terminated_length": 15.9, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.9584708470847084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0643165530636907, "learning_rate": 2.6311421049366736e-08, "loss": 0.0, "num_tokens": 20046580.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13940 }, { "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.9, "completions/mean_terminated_length": 16.9, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.9591584158415841, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.126106108725071, "learning_rate": 2.5450277046162874e-08, "loss": 0.0, "num_tokens": 20061356.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13950 }, { "completion_length": 19.7, "completions/clipped_ratio": 0.0, "completions/max_length": 19.7, "completions/max_terminated_length": 19.7, "completions/mean_length": 18.05, "completions/mean_terminated_length": 18.05, "completions/min_length": 17.1, "completions/min_terminated_length": 17.1, "epoch": 0.9598459845984598, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9622666202485561, "learning_rate": 2.460338855385297e-08, "loss": 0.0, "num_tokens": 20075018.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13960 }, { "completion_length": 17.2, "completions/clipped_ratio": 0.0, "completions/max_length": 17.2, "completions/max_terminated_length": 17.2, "completions/mean_length": 15.55, "completions/mean_terminated_length": 15.55, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.9605335533553355, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.032901889272034, "learning_rate": 2.3770760451234665e-08, "loss": 0.0, "num_tokens": 20089012.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13970 }, { "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.9612211221122112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3917377760633827, "learning_rate": 2.2952397534954097e-08, "loss": 0.0001, "num_tokens": 20102060.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13980 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 15.55, "completions/mean_terminated_length": 15.55, "completions/min_length": 14.3, "completions/min_terminated_length": 14.3, "epoch": 0.9619086908690869, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2739990446716547, "learning_rate": 2.214830451947786e-08, "loss": 0.0, "num_tokens": 20116294.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 13990 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 16.1, "completions/min_terminated_length": 16.1, "epoch": 0.9625962596259626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1548074826598167, "learning_rate": 2.1358486037065253e-08, "loss": 0.0, "num_tokens": 20130169.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14000 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 15.575, "completions/mean_terminated_length": 15.575, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.9632838283828383, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2935785874724388, "learning_rate": 2.058294663774274e-08, "loss": 0.0, "num_tokens": 20144076.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14010 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.45, "completions/mean_terminated_length": 15.45, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.963971397139714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0520252301823347, "learning_rate": 1.982169078927676e-08, "loss": 0.0, "num_tokens": 20157418.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14020 }, { "completion_length": 21.7, "completions/clipped_ratio": 0.0, "completions/max_length": 21.7, "completions/max_terminated_length": 21.7, "completions/mean_length": 18.3, "completions/mean_terminated_length": 18.3, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.9646589658965896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0606492094695568, "learning_rate": 1.90747228771479e-08, "loss": 0.0, "num_tokens": 20169898.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14030 }, { "completion_length": 18.7, "completions/clipped_ratio": 0.0, "completions/max_length": 18.7, "completions/max_terminated_length": 18.7, "completions/mean_length": 17.6, "completions/mean_terminated_length": 17.6, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.9653465346534653, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2338533788919448, "learning_rate": 1.8342047204527313e-08, "loss": 0.0, "num_tokens": 20182066.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14040 }, { "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 19.175, "completions/mean_terminated_length": 19.175, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.966034103410341, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9971127048134804, "learning_rate": 1.7623667992249516e-08, "loss": 0.0, "num_tokens": 20195993.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14050 }, { "completion_length": 22.6, "completions/clipped_ratio": 0.0, "completions/max_length": 22.6, "completions/max_terminated_length": 22.6, "completions/mean_length": 19.875, "completions/mean_terminated_length": 19.875, "completions/min_length": 17.3, "completions/min_terminated_length": 17.3, "epoch": 0.9667216721672167, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9500480651855469, "learning_rate": 1.691958937879018e-08, "loss": 0.0, "num_tokens": 20210552.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14060 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.575, "completions/mean_terminated_length": 16.575, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.9674092409240924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3005391888320446, "learning_rate": 1.6229815420240323e-08, "loss": 0.0, "num_tokens": 20225111.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14070 }, { "completion_length": 18.8, "completions/clipped_ratio": 0.0, "completions/max_length": 18.8, "completions/max_terminated_length": 18.8, "completions/mean_length": 17.2, "completions/mean_terminated_length": 17.2, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.9680968096809681, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2837211087346077, "learning_rate": 1.555435009028494e-08, "loss": 0.0, "num_tokens": 20242639.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14080 }, { "completion_length": 19.1, "completions/clipped_ratio": 0.0, "completions/max_length": 19.1, "completions/max_terminated_length": 19.1, "completions/mean_length": 17.55, "completions/mean_terminated_length": 17.55, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.9687843784378438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1381570495665074, "learning_rate": 1.4893197280178574e-08, "loss": 0.0, "num_tokens": 20256725.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14090 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 17.4, "completions/mean_terminated_length": 17.4, "completions/min_length": 15.9, "completions/min_terminated_length": 15.9, "epoch": 0.9694719471947195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.20697915032506, "learning_rate": 1.4246360798723657e-08, "loss": 0.0, "num_tokens": 20272581.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14100 }, { "completion_length": 17.8, "completions/clipped_ratio": 0.0, "completions/max_length": 17.8, "completions/max_terminated_length": 17.8, "completions/mean_length": 16.15, "completions/mean_terminated_length": 16.15, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.9701595159515951, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.282748973183334, "learning_rate": 1.3613844372248885e-08, "loss": 0.0, "num_tokens": 20287439.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14110 }, { "completion_length": 21.2, "completions/clipped_ratio": 0.0, "completions/max_length": 21.2, "completions/max_terminated_length": 21.2, "completions/mean_length": 17.925, "completions/mean_terminated_length": 17.925, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.9708470847084708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0922008886933328, "learning_rate": 1.2995651644586437e-08, "loss": 0.0, "num_tokens": 20302056.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14120 }, { "completion_length": 20.1, "completions/clipped_ratio": 0.0, "completions/max_length": 20.1, "completions/max_terminated_length": 20.1, "completions/mean_length": 17.575, "completions/mean_terminated_length": 17.575, "completions/min_length": 15.7, "completions/min_terminated_length": 15.7, "epoch": 0.9715346534653465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1506574019789695, "learning_rate": 1.2391786177052278e-08, "loss": 0.0, "num_tokens": 20315039.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14130 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 15.7, "completions/mean_terminated_length": 15.7, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.9722222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.37265754789114, "learning_rate": 1.1802251448424784e-08, "loss": 0.0, "num_tokens": 20329127.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14140 }, { "completion_length": 22.6, "completions/clipped_ratio": 0.0, "completions/max_length": 22.6, "completions/max_terminated_length": 22.6, "completions/mean_length": 19.075, "completions/mean_terminated_length": 19.075, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9729097909790979, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8400764152407646, "learning_rate": 1.122705085492587e-08, "loss": 0.0, "num_tokens": 20343762.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14150 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 16.075, "completions/mean_terminated_length": 16.075, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9735973597359736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0099975235760212, "learning_rate": 1.0666187710199616e-08, "loss": 0.0, "num_tokens": 20359121.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14160 }, { "completion_length": 21.3, "completions/clipped_ratio": 0.0, "completions/max_length": 21.3, "completions/max_terminated_length": 21.3, "completions/mean_length": 18.05, "completions/mean_terminated_length": 18.05, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.9742849284928493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.8368704963475466, "learning_rate": 1.0119665245295063e-08, "loss": 0.0, "num_tokens": 20373603.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14170 }, { "completion_length": 18.6, "completions/clipped_ratio": 0.0, "completions/max_length": 18.6, "completions/max_terminated_length": 18.6, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.2, "completions/min_terminated_length": 17.2, "epoch": 0.974972497249725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.164952965080738, "learning_rate": 9.587486608646224e-09, "loss": 0.0, "num_tokens": 20388985.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14180 }, { "completion_length": 19.9, "completions/clipped_ratio": 0.0, "completions/max_length": 19.9, "completions/max_terminated_length": 19.9, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.9756600660066007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1645680345594882, "learning_rate": 9.069654866054877e-09, "loss": 0.0, "num_tokens": 20403135.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14190 }, { "completion_length": 15.8, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 15.2, "completions/mean_terminated_length": 15.2, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.9763476347634763, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9270360007882118, "learning_rate": 8.566173000671696e-09, "loss": 0.0, "num_tokens": 20415259.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14200 }, { "completion_length": 21.4, "completions/clipped_ratio": 0.0, "completions/max_length": 21.4, "completions/max_terminated_length": 21.4, "completions/mean_length": 18.475, "completions/mean_terminated_length": 18.475, "completions/min_length": 15.8, "completions/min_terminated_length": 15.8, "epoch": 0.977035203520352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0703364945948124, "learning_rate": 8.077043912981252e-09, "loss": 0.0, "num_tokens": 20429638.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14210 }, { "completion_length": 18.2, "completions/clipped_ratio": 0.0, "completions/max_length": 18.2, "completions/max_terminated_length": 18.2, "completions/mean_length": 16.8, "completions/mean_terminated_length": 16.8, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.9777227722772277, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3044220611453057, "learning_rate": 7.60227042078232e-09, "loss": 0.0, "num_tokens": 20444002.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14220 }, { "completion_length": 21.9, "completions/clipped_ratio": 0.0, "completions/max_length": 21.9, "completions/max_terminated_length": 21.9, "completions/mean_length": 20.275, "completions/mean_terminated_length": 20.275, "completions/min_length": 18.2, "completions/min_terminated_length": 18.2, "epoch": 0.9784103410341034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0244949704036117, "learning_rate": 7.141855259174268e-09, "loss": 0.0, "num_tokens": 20456725.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14230 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 16.125, "completions/mean_terminated_length": 16.125, "completions/min_length": 14.9, "completions/min_terminated_length": 14.9, "epoch": 0.9790979097909791, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1022946760058403, "learning_rate": 6.695801080540132e-09, "loss": 0.0, "num_tokens": 20472010.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14240 }, { "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 14.8, "completions/mean_terminated_length": 14.8, "completions/min_length": 14.1, "completions/min_terminated_length": 14.1, "epoch": 0.9797854785478548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3467012587934732, "learning_rate": 6.264110454531347e-09, "loss": 0.0, "num_tokens": 20485502.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14250 }, { "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.4, "completions/min_terminated_length": 15.4, "epoch": 0.9804730473047305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.389507986046374, "learning_rate": 5.8467858680527625e-09, "loss": 0.0, "num_tokens": 20498382.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14260 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 16.525, "completions/mean_terminated_length": 16.525, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.9811606160616062, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.425867623090744, "learning_rate": 5.443829725249039e-09, "loss": 0.0, "num_tokens": 20515671.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14270 }, { "completion_length": 17.1, "completions/clipped_ratio": 0.0, "completions/max_length": 17.1, "completions/max_terminated_length": 17.1, "completions/mean_length": 15.975, "completions/mean_terminated_length": 15.975, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.9818481848184818, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0982329353690148, "learning_rate": 5.055244347490496e-09, "loss": 0.0, "num_tokens": 20529802.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14280 }, { "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.7, "completions/mean_terminated_length": 18.7, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.9825357535753575, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9369197685271502, "learning_rate": 4.681031973358951e-09, "loss": 0.0, "num_tokens": 20543754.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14290 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.625, "completions/mean_terminated_length": 16.625, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.9832233223322332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9515450349077582, "learning_rate": 4.321194758636071e-09, "loss": 0.0, "num_tokens": 20559195.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14300 }, { "completion_length": 17.6, "completions/clipped_ratio": 0.0, "completions/max_length": 17.6, "completions/max_terminated_length": 17.6, "completions/mean_length": 16.425, "completions/mean_terminated_length": 16.425, "completions/min_length": 15.3, "completions/min_terminated_length": 15.3, "epoch": 0.9839108910891089, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0747473061084747, "learning_rate": 3.975734776290319e-09, "loss": 0.0, "num_tokens": 20572748.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14310 }, { "completion_length": 17.7, "completions/clipped_ratio": 0.0, "completions/max_length": 17.7, "completions/max_terminated_length": 17.7, "completions/mean_length": 16.85, "completions/mean_terminated_length": 16.85, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.9845984598459846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3751868993043899, "learning_rate": 3.6446540164652988e-09, "loss": 0.0, "num_tokens": 20587822.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14320 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 15.7, "completions/mean_terminated_length": 15.7, "completions/min_length": 14.7, "completions/min_terminated_length": 14.7, "epoch": 0.9852860286028603, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2880969166755676, "learning_rate": 3.327954386466992e-09, "loss": 0.0, "num_tokens": 20602098.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14330 }, { "completion_length": 15.8, "completions/clipped_ratio": 0.0, "completions/max_length": 15.8, "completions/max_terminated_length": 15.8, "completions/mean_length": 15.425, "completions/mean_terminated_length": 15.425, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.985973597359736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3289981491863727, "learning_rate": 3.0256377107554246e-09, "loss": 0.0, "num_tokens": 20615983.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14340 }, { "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.975, "completions/mean_terminated_length": 16.975, "completions/min_length": 13.9, "completions/min_terminated_length": 13.9, "epoch": 0.9866611661166117, "frac_reward_zero_std": 1.0, "grad_norm": 6.744367419742048e-05, "kl": 1.3962466426193714, "learning_rate": 2.7377057309313502e-09, "loss": 0.0, "num_tokens": 20628250.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14350 }, { "completion_length": 18.3, "completions/clipped_ratio": 0.0, "completions/max_length": 18.3, "completions/max_terminated_length": 18.3, "completions/mean_length": 16.3, "completions/mean_terminated_length": 16.3, "completions/min_length": 15.1, "completions/min_terminated_length": 15.1, "epoch": 0.9873487348734874, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.2089169837534428, "learning_rate": 2.464160105727642e-09, "loss": 0.0, "num_tokens": 20642162.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14360 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 17.275, "completions/mean_terminated_length": 17.275, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.988036303630363, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3353900104761123, "learning_rate": 2.2050024110001345e-09, "loss": 0.0, "num_tokens": 20656053.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14370 }, { "completion_length": 17.4, "completions/clipped_ratio": 0.0, "completions/max_length": 17.4, "completions/max_terminated_length": 17.4, "completions/mean_length": 16.175, "completions/mean_terminated_length": 16.175, "completions/min_length": 15.6, "completions/min_terminated_length": 15.6, "epoch": 0.9887238723872387, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1925065904855727, "learning_rate": 1.9602341397173542e-09, "loss": 0.0, "num_tokens": 20669724.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14380 }, { "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 15.575, "completions/mean_terminated_length": 15.575, "completions/min_length": 14.6, "completions/min_terminated_length": 14.6, "epoch": 0.9894114411441144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.3791985176503658, "learning_rate": 1.7298567019527479e-09, "loss": 0.0, "num_tokens": 20687811.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14390 }, { "completion_length": 17.3, "completions/clipped_ratio": 0.0, "completions/max_length": 17.3, "completions/max_terminated_length": 17.3, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.9900990099009901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.4439594164490699, "learning_rate": 1.513871424876079e-09, "loss": 0.0, "num_tokens": 20703267.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14400 }, { "completion_length": 20.4, "completions/clipped_ratio": 0.0, "completions/max_length": 20.4, "completions/max_terminated_length": 20.4, "completions/mean_length": 18.975, "completions/mean_terminated_length": 18.975, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.9907865786578658, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.0520384900271893, "learning_rate": 1.3122795527467647e-09, "loss": 0.0, "num_tokens": 20715926.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14410 }, { "completion_length": 16.3, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 15.575, "completions/mean_terminated_length": 15.575, "completions/min_length": 14.8, "completions/min_terminated_length": 14.8, "epoch": 0.9914741474147415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.1841505281394347, "learning_rate": 1.125082246904996e-09, "loss": 0.0, "num_tokens": 20729649.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14420 }, { "completion_length": 18.1, "completions/clipped_ratio": 0.0, "completions/max_length": 18.1, "completions/max_terminated_length": 18.1, "completions/mean_length": 17.3, "completions/mean_terminated_length": 17.3, "completions/min_length": 16.4, "completions/min_terminated_length": 16.4, "epoch": 0.9921617161716172, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.377218122780323, "learning_rate": 9.52280585766463e-10, "loss": 0.0, "num_tokens": 20747045.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14430 }, { "completion_length": 16.3, "completions/clipped_ratio": 0.0, "completions/max_length": 16.3, "completions/max_terminated_length": 16.3, "completions/mean_length": 15.275, "completions/mean_terminated_length": 15.275, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.9928492849284929, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 1.112162598967552, "learning_rate": 7.938755648156938e-10, "loss": 0.0, "num_tokens": 20758908.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14440 }, { "completion_length": 19.2, "completions/clipped_ratio": 0.0, "completions/max_length": 19.2, "completions/max_terminated_length": 19.2, "completions/mean_length": 16.7, "completions/mean_terminated_length": 16.7, "completions/min_length": 14.4, "completions/min_terminated_length": 14.4, "epoch": 0.9935368536853685, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.9231208987534046, "learning_rate": 6.498680966007809e-10, "loss": 0.0, "num_tokens": 20774668.0, "reward": 4.099999904632568, "reward_std": 0.0, "rewards/coherence_reward_func/mean": 1.2999999523162842, "rewards/coherence_reward_func/std": 0.0, "rewards/formatting_reward_func/mean": 2.0, "rewards/formatting_reward_func/std": 0.0, "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 14450 } ], "logging_steps": 10, "max_steps": 14544, "num_input_tokens_seen": 20774668, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }