diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,8987 +2,147 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.020911372149689057, + "epoch": 0.003819126183929117, "eval_steps": 500, - "global_step": 3450, + "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "completion_length": 53.0, + "completion_length": 4.0, "completions/clipped_ratio": 0.0, - "completions/max_length": 53.0, - "completions/max_terminated_length": 53.0, - "completions/mean_length": 47.61666666666667, - "completions/mean_terminated_length": 47.61666666666667, - "completions/min_length": 41.13333333333333, - "completions/min_terminated_length": 41.13333333333333, - "epoch": 6.061267289764944e-05, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.19573745131492615, - "kl": 0.038476434412101905, - "learning_rate": 4.903154239845798e-06, + "completions/max_length": 4.0, + "completions/max_terminated_length": 4.0, + "completions/mean_length": 3.875, + "completions/mean_terminated_length": 3.875, + "completions/min_length": 3.7, + "completions/min_terminated_length": 3.7, + "epoch": 0.0007638252367858234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 5.584762921273523e-06, + "learning_rate": 3.4351145038167944e-08, "loss": 0.0, - "num_tokens": 14686.0, - "reward": 3.297205893198649, - "reward_std": 0.13904121598849695, - "rewards/coherence_reward_func/mean": 0.8483333309491475, - "rewards/coherence_reward_func/std": 0.03666666845480601, - "rewards/formatting_reward_func/mean": 1.3, + "num_tokens": 14515.0, + "reward": 0.40999999046325686, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 0.12999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 0.2, "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1488724946975708, - "rewards/quality_reward_func/std": 0.10623545634249847, + "rewards/quality_reward_func/mean": 0.0800000011920929, + "rewards/quality_reward_func/std": 0.0, "step": 10 }, { - "completion_length": 67.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 67.8, - "completions/max_terminated_length": 67.8, - "completions/mean_length": 60.6, - "completions/mean_terminated_length": 60.6, - "completions/min_length": 51.1, - "completions/min_terminated_length": 51.1, - "epoch": 0.00012122534579529888, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.14363239705562592, - "kl": 0.0513855260447599, - "learning_rate": 3.897982258676867e-06, - "loss": 0.0, - "num_tokens": 29694.0, - "reward": 3.987208294868469, - "reward_std": 0.07321195515105501, - "rewards/coherence_reward_func/mean": 0.9899999976158143, - "rewards/coherence_reward_func/std": 0.020000000298023225, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.497208344936371, - "rewards/quality_reward_func/std": 0.05723555360455066, + "completion_length": 6.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6.7, + "completions/max_terminated_length": 6.7, + "completions/mean_length": 5.575, + "completions/mean_terminated_length": 5.575, + "completions/min_length": 4.3, + "completions/min_terminated_length": 4.3, + "epoch": 0.0015276504735716467, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 1.2543402516485002e-05, + "learning_rate": 7.251908396946566e-08, + "loss": 0.0, + "num_tokens": 26178.0, + "reward": 1.024999976158142, + "reward_std": 0.23671360015869142, + "rewards/coherence_reward_func/mean": 0.32499998807907104, + "rewards/coherence_reward_func/std": 0.07505553364753723, + "rewards/formatting_reward_func/mean": 0.5, + "rewards/formatting_reward_func/std": 0.1154700517654419, + "rewards/quality_reward_func/mean": 0.20000000298023224, + "rewards/quality_reward_func/std": 0.046188023686408994, "step": 20 }, { - "completion_length": 60.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 60.7, - "completions/max_terminated_length": 60.7, - "completions/mean_length": 51.8, - "completions/mean_terminated_length": 51.8, - "completions/min_length": 43.5, - "completions/min_terminated_length": 43.5, - "epoch": 0.00018183801869294832, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.3984941244125366, - "kl": 0.052938600437482816, - "learning_rate": 2.238678841830867e-06, - "loss": 0.0, - "num_tokens": 45670.0, - "reward": 3.331803727149963, - "reward_std": 0.12095312587916851, - "rewards/coherence_reward_func/mean": 0.8925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.35, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.089303719997406, - "rewards/quality_reward_func/std": 0.11160956136882305, + "completion_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3.0, + "completions/max_terminated_length": 3.0, + "completions/mean_length": 1.5, + "completions/mean_terminated_length": 1.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.00229147571035747, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 1.8039132919511756e-05, + "learning_rate": 1.1068702290076337e-07, + "loss": 0.0, + "num_tokens": 40390.0, + "reward": 0.10249999761581421, + "reward_std": 0.20499999523162843, + "rewards/coherence_reward_func/mean": 0.032499998807907104, + "rewards/coherence_reward_func/std": 0.06499999761581421, + "rewards/formatting_reward_func/mean": 0.05, + "rewards/formatting_reward_func/std": 0.1, + "rewards/quality_reward_func/mean": 0.020000000298023225, + "rewards/quality_reward_func/std": 0.04000000059604645, "step": 30 }, { - "completion_length": 69.1, + "completion_length": 6.9, "completions/clipped_ratio": 0.0, - "completions/max_length": 69.1, - "completions/max_terminated_length": 69.1, - "completions/mean_length": 55.575, - "completions/mean_terminated_length": 55.575, - "completions/min_length": 45.8, - "completions/min_terminated_length": 45.8, - "epoch": 0.00024245069159059776, - "frac_reward_zero_std": 0.1, + "completions/max_length": 6.9, + "completions/max_terminated_length": 6.9, + "completions/mean_length": 6.3, + "completions/mean_terminated_length": 6.3, + "completions/min_length": 5.6, + "completions/min_terminated_length": 5.6, + "epoch": 0.0030553009471432934, + "frac_reward_zero_std": 0.9, "grad_norm": 0.0, - "kl": 0.047772943903692064, - "learning_rate": 7.016504991533727e-07, + "kl": 4.868981123611338e-05, + "learning_rate": 1.4885496183206107e-07, "loss": 0.0, - "num_tokens": 59741.0, - "reward": 3.563346338272095, - "reward_std": 0.15187854310497642, - "rewards/coherence_reward_func/mean": 0.8899999976158142, - "rewards/coherence_reward_func/std": 0.020000000298023225, - "rewards/formatting_reward_func/mean": 1.35, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3233463883399963, - "rewards/quality_reward_func/std": 0.14236470488831401, + "num_tokens": 54058.0, + "reward": 0.9599999785423279, + "reward_std": 0.16165807247161865, + "rewards/coherence_reward_func/mean": 0.32499998807907104, + "rewards/coherence_reward_func/std": 0.07505553364753723, + "rewards/formatting_reward_func/mean": 0.475, + "rewards/formatting_reward_func/std": 0.08660253882408142, + "rewards/quality_reward_func/mean": 0.1600000023841858, + "rewards/quality_reward_func/std": 0.0, "step": 40 }, { - "completion_length": 50.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 50.8, - "completions/max_terminated_length": 50.8, - "completions/mean_length": 42.825, - "completions/mean_terminated_length": 42.825, - "completions/min_length": 34.3, - "completions/min_terminated_length": 34.3, - "epoch": 0.0003030633644882472, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.33391332626342773, - "kl": 0.11284379730932415, - "learning_rate": 6.089874350439507e-09, - "loss": 0.0, - "num_tokens": 71218.0, - "reward": 3.385823893547058, - "reward_std": 0.3162759563419968, - "rewards/coherence_reward_func/mean": 0.9100000023841858, - "rewards/coherence_reward_func/std": 0.08000000417232514, - "rewards/formatting_reward_func/mean": 1.3875, - "rewards/formatting_reward_func/std": 0.075, - "rewards/quality_reward_func/mean": 1.0883238837122917, - "rewards/quality_reward_func/std": 0.16834406293928622, + "completion_length": 2.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2.2, + "completions/max_terminated_length": 2.2, + "completions/mean_length": 1.3, + "completions/mean_terminated_length": 1.3, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.003819126183929117, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 4.7134438500506806e-05, + "learning_rate": 1.870229007633588e-07, + "loss": 0.0, + "num_tokens": 67990.0, + "reward": 0.10249999761581421, + "reward_std": 0.20499999523162843, + "rewards/coherence_reward_func/mean": 0.032499998807907104, + "rewards/coherence_reward_func/std": 0.06499999761581421, + "rewards/formatting_reward_func/mean": 0.05, + "rewards/formatting_reward_func/std": 0.1, + "rewards/quality_reward_func/mean": 0.020000000298023225, + "rewards/quality_reward_func/std": 0.04000000059604645, "step": 50 - }, - { - "completion_length": 45.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.3, - "completions/max_terminated_length": 45.3, - "completions/mean_length": 40.05, - "completions/mean_terminated_length": 40.05, - "completions/min_length": 36.3, - "completions/min_terminated_length": 36.3, - "epoch": 0.00036367603738589663, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.1630372256040573, - "kl": 0.059176677470168214, - "learning_rate": 1.475e-06, - "loss": 0.0, - "num_tokens": 82576.0, - "reward": 2.584429931640625, - "reward_std": 0.11261246949434281, - "rewards/coherence_reward_func/mean": 0.6925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.05, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 0.8419298708438874, - "rewards/quality_reward_func/std": 0.10122912768274546, - "step": 60 - }, - { - "completion_length": 52.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 52.4, - "completions/max_terminated_length": 52.4, - "completions/mean_length": 46.425, - "completions/mean_terminated_length": 46.425, - "completions/min_length": 40.8, - "completions/min_terminated_length": 40.8, - "epoch": 0.0004242887102835461, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.26949363946914673, - "kl": 0.05641823921396281, - "learning_rate": 1.725e-06, - "loss": 0.0, - "num_tokens": 96621.0, - "reward": 3.2763977527618406, - "reward_std": 0.12580769900232552, - "rewards/coherence_reward_func/mean": 0.8625, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.35, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.0638977468013764, - "rewards/quality_reward_func/std": 0.13974314024671913, - "step": 70 - }, - { - "completion_length": 56.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 56.2, - "completions/max_terminated_length": 56.2, - "completions/mean_length": 47.55, - "completions/mean_terminated_length": 47.55, - "completions/min_length": 39.8, - "completions/min_terminated_length": 39.8, - "epoch": 0.0004849013831811955, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.0, - "kl": 0.06750868092640303, - "learning_rate": 1.975e-06, - "loss": 0.0, - "num_tokens": 112195.0, - "reward": 3.154419016838074, - "reward_std": 0.25714875645935537, - "rewards/coherence_reward_func/mean": 0.8425000011920929, - "rewards/coherence_reward_func/std": 0.07273502796888351, - "rewards/formatting_reward_func/mean": 1.275, - "rewards/formatting_reward_func/std": 0.08660253882408142, - "rewards/quality_reward_func/mean": 1.036919003725052, - "rewards/quality_reward_func/std": 0.11499775731936097, - "step": 80 - }, - { - "completion_length": 48.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 48.1, - "completions/max_terminated_length": 48.1, - "completions/mean_length": 42.8, - "completions/mean_terminated_length": 42.8, - "completions/min_length": 38.5, - "completions/min_terminated_length": 38.5, - "epoch": 0.000545514056078845, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.3837050795555115, - "kl": 0.09911922943801983, - "learning_rate": 2.2250000000000003e-06, - "loss": 0.0, - "num_tokens": 123383.0, - "reward": 3.6385270595550536, - "reward_std": 0.3143979568965733, - "rewards/coherence_reward_func/mean": 0.9449999988079071, - "rewards/coherence_reward_func/std": 0.05, - "rewards/formatting_reward_func/mean": 1.4625, - "rewards/formatting_reward_func/std": 0.075, - "rewards/quality_reward_func/mean": 1.2310271203517913, - "rewards/quality_reward_func/std": 0.1893980055116117, - "step": 90 - }, - { - "completion_length": 50.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 50.6, - "completions/max_terminated_length": 50.6, - "completions/mean_length": 45.55, - "completions/mean_terminated_length": 45.55, - "completions/min_length": 40.5, - "completions/min_terminated_length": 40.5, - "epoch": 0.0006061267289764944, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.05494948438672509, - "learning_rate": 2.475e-06, - "loss": 0.0, - "num_tokens": 136841.0, - "reward": 3.539727210998535, - "reward_std": 0.10118589112535119, - "rewards/coherence_reward_func/mean": 0.9, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.35, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2897271931171417, - "rewards/quality_reward_func/std": 0.10118585228919982, - "step": 100 - }, - { - "completion_length": 41.0625, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.0625, - "completions/max_terminated_length": 41.0625, - "completions/mean_length": 37.03125, - "completions/mean_terminated_length": 37.03125, - "completions/min_length": 31.0625, - "completions/min_terminated_length": 31.0625, - "epoch": 0.0006667394018741438, - "frac_reward_zero_std": 0.0625, - "grad_norm": 0.0, - "kl": 0.06317420813138597, - "learning_rate": 2.7250000000000006e-06, - "loss": 0.0, - "num_tokens": 150046.0, - "reward": 3.615771561861038, - "reward_std": 0.1497760786442086, - "rewards/coherence_reward_func/mean": 0.9156249985098839, - "rewards/coherence_reward_func/std": 0.04374999925494194, - "rewards/formatting_reward_func/mean": 1.40625, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2938964813947678, - "rewards/quality_reward_func/std": 0.10602607735199854, - "step": 110 - }, - { - "completion_length": 38.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.3, - "completions/max_terminated_length": 38.3, - "completions/mean_length": 33.875, - "completions/mean_terminated_length": 33.875, - "completions/min_length": 29.9, - "completions/min_terminated_length": 29.9, - "epoch": 0.0007273520747717933, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.29878297448158264, - "kl": 0.08165747453313088, - "learning_rate": 2.9750000000000003e-06, - "loss": 0.0, - "num_tokens": 160801.0, - "reward": 2.973150110244751, - "reward_std": 0.14958924502134324, - "rewards/coherence_reward_func/mean": 0.8, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.2, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 0.9731501996517181, - "rewards/quality_reward_func/std": 0.14958928674459457, - "step": 120 - }, - { - "completion_length": 53.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 53.9, - "completions/max_terminated_length": 53.9, - "completions/mean_length": 45.95, - "completions/mean_terminated_length": 45.95, - "completions/min_length": 39.3, - "completions/min_terminated_length": 39.3, - "epoch": 0.0007879647476694427, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.21056820452213287, - "kl": 0.1461421002750285, - "learning_rate": 3.2250000000000005e-06, - "loss": 0.0, - "num_tokens": 176099.0, - "reward": 3.463565444946289, - "reward_std": 0.34478279487229885, - "rewards/coherence_reward_func/mean": 0.9149999976158142, - "rewards/coherence_reward_func/std": 0.07000000029802322, - "rewards/formatting_reward_func/mean": 1.3875, - "rewards/formatting_reward_func/std": 0.075, - "rewards/quality_reward_func/mean": 1.161065411567688, - "rewards/quality_reward_func/std": 0.20001683793962002, - "step": 130 - }, - { - "completion_length": 52.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 52.1, - "completions/max_terminated_length": 52.1, - "completions/mean_length": 44.55, - "completions/mean_terminated_length": 44.55, - "completions/min_length": 36.9, - "completions/min_terminated_length": 36.9, - "epoch": 0.0008485774205670922, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.21922150254249573, - "kl": 0.07869280752702253, - "learning_rate": 3.475e-06, - "loss": 0.0, - "num_tokens": 191953.0, - "reward": 3.339912676811218, - "reward_std": 0.06815968146547675, - "rewards/coherence_reward_func/mean": 0.8774999976158142, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.35, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.112412703037262, - "rewards/quality_reward_func/std": 0.06322385780513287, - "step": 140 - }, - { - "completion_length": 40.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.6, - "completions/max_terminated_length": 40.6, - "completions/mean_length": 35.225, - "completions/mean_terminated_length": 35.225, - "completions/min_length": 29.6, - "completions/min_terminated_length": 29.6, - "epoch": 0.0009091900934647416, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.23550187051296234, - "kl": 0.21491210646927356, - "learning_rate": 3.7250000000000003e-06, - "loss": 0.0, - "num_tokens": 208230.0, - "reward": 3.599507009983063, - "reward_std": 0.3249947665259242, - "rewards/coherence_reward_func/mean": 0.9199999988079071, - "rewards/coherence_reward_func/std": 0.05773502588272095, - "rewards/formatting_reward_func/mean": 1.425, - "rewards/formatting_reward_func/std": 0.08660253882408142, - "rewards/quality_reward_func/mean": 1.2545070230960846, - "rewards/quality_reward_func/std": 0.1829237760975957, - "step": 150 - }, - { - "completion_length": 42.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.1, - "completions/max_terminated_length": 42.1, - "completions/mean_length": 36.325, - "completions/mean_terminated_length": 36.325, - "completions/min_length": 31.6, - "completions/min_terminated_length": 31.6, - "epoch": 0.000969802766362391, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.2995178699493408, - "kl": 0.24042115244665183, - "learning_rate": 3.975000000000001e-06, - "loss": 0.0, - "num_tokens": 220263.0, - "reward": 3.6113248109817504, - "reward_std": 0.10339606767520308, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1413248002529144, - "rewards/quality_reward_func/std": 0.10339611349627376, - "step": 160 - }, - { - "completion_length": 47.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 47.8, - "completions/max_terminated_length": 47.8, - "completions/mean_length": 41.375, - "completions/mean_terminated_length": 41.375, - "completions/min_length": 35.9, - "completions/min_terminated_length": 35.9, - "epoch": 0.0010304154392600405, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.21306632459163666, - "kl": 0.09573557204566896, - "learning_rate": 4.225e-06, - "loss": 0.0, - "num_tokens": 234170.0, - "reward": 3.968274688720703, - "reward_std": 0.1419311842881143, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4682746946811676, - "rewards/quality_reward_func/std": 0.1419312173151411, - "step": 170 - }, - { - "completion_length": 57.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 57.8, - "completions/max_terminated_length": 57.8, - "completions/mean_length": 52.725, - "completions/mean_terminated_length": 52.725, - "completions/min_length": 47.0, - "completions/min_terminated_length": 47.0, - "epoch": 0.00109102811215769, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.34620681405067444, - "kl": 0.10235883156419731, - "learning_rate": 4.475e-06, - "loss": 0.0, - "num_tokens": 246011.0, - "reward": 3.922975492477417, - "reward_std": 0.0824978705495596, - "rewards/coherence_reward_func/mean": 0.9700000047683716, - "rewards/coherence_reward_func/std": 0.020000000298023225, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4529755979776382, - "rewards/quality_reward_func/std": 0.07113206386566162, - "step": 180 - }, - { - "completion_length": 43.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.6, - "completions/max_terminated_length": 43.6, - "completions/mean_length": 38.975, - "completions/mean_terminated_length": 38.975, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, - "epoch": 0.0011516407850553393, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.3032384514808655, - "kl": 0.24063811488449574, - "learning_rate": 4.7250000000000005e-06, - "loss": 0.0, - "num_tokens": 257678.0, - "reward": 3.8886915683746337, - "reward_std": 0.06705415202304721, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3886915802955628, - "rewards/quality_reward_func/std": 0.06705412231385707, - "step": 190 - }, - { - "completion_length": 55.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 55.0, - "completions/max_terminated_length": 55.0, - "completions/mean_length": 49.225, - "completions/mean_terminated_length": 49.225, - "completions/min_length": 44.7, - "completions/min_terminated_length": 44.7, - "epoch": 0.0012122534579529888, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.14065846800804138, - "kl": 0.10155629244400188, - "learning_rate": 4.975000000000001e-06, - "loss": 0.0, - "num_tokens": 267063.0, - "reward": 3.7897242307662964, - "reward_std": 0.09715260849334299, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3047242045402527, - "rewards/quality_reward_func/std": 0.08471100898459553, - "step": 200 - }, - { - "completion_length": 52.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 52.7, - "completions/max_terminated_length": 52.7, - "completions/mean_length": 45.975, - "completions/mean_terminated_length": 45.975, - "completions/min_length": 40.6, - "completions/min_terminated_length": 40.6, - "epoch": 0.0012728661308506382, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.33726778626441956, - "kl": 0.15650667499285192, - "learning_rate": 4.9996915812041515e-06, - "loss": 0.0, - "num_tokens": 279706.0, - "reward": 3.7803353548049925, - "reward_std": 0.1521661963313818, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2803353130817414, - "rewards/quality_reward_func/std": 0.1521662008948624, - "step": 210 - }, - { - "completion_length": 41.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.6, - "completions/max_terminated_length": 41.6, - "completions/mean_length": 38.7, - "completions/mean_terminated_length": 38.7, - "completions/min_length": 35.2, - "completions/min_terminated_length": 35.2, - "epoch": 0.0013334788037482877, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.21357978880405426, - "kl": 0.2635789274936542, - "learning_rate": 4.998625539854394e-06, - "loss": 0.0, - "num_tokens": 293146.0, - "reward": 3.7800986051559446, - "reward_std": 0.11134308017790318, - "rewards/coherence_reward_func/mean": 0.9899999976158143, - "rewards/coherence_reward_func/std": 0.020000000298023225, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2900986403226853, - "rewards/quality_reward_func/std": 0.09898405130952596, - "step": 220 - }, - { - "completion_length": 55.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 55.4, - "completions/max_terminated_length": 55.4, - "completions/mean_length": 51.325, - "completions/mean_terminated_length": 51.325, - "completions/min_length": 48.9, - "completions/min_terminated_length": 48.9, - "epoch": 0.0013940914766459372, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.2249079793691635, - "kl": 0.14070038970094173, - "learning_rate": 4.996798392960466e-06, - "loss": 0.0, - "num_tokens": 307951.0, - "reward": 3.8009377479553224, - "reward_std": 0.13956304402090608, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3309377551078796, - "rewards/quality_reward_func/std": 0.13956300741992891, - "step": 230 - }, - { - "completion_length": 53.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 53.6, - "completions/max_terminated_length": 53.6, - "completions/mean_length": 46.575, - "completions/mean_terminated_length": 46.575, - "completions/min_length": 41.5, - "completions/min_terminated_length": 41.5, - "epoch": 0.0014547041495435865, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.13777810335159302, - "kl": 0.21170329675078392, - "learning_rate": 4.9942106970890136e-06, - "loss": 0.0, - "num_tokens": 322158.0, - "reward": 3.703891324996948, - "reward_std": 0.1922367438673973, - "rewards/coherence_reward_func/mean": 0.9700000047683716, - "rewards/coherence_reward_func/std": 0.03464101552963257, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2338913679122925, - "rewards/quality_reward_func/std": 0.17463979609310626, - "step": 240 - }, - { - "completion_length": 47.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 47.2, - "completions/max_terminated_length": 47.2, - "completions/mean_length": 41.55, - "completions/mean_terminated_length": 41.55, - "completions/min_length": 37.8, - "completions/min_terminated_length": 37.8, - "epoch": 0.001515316822441236, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.17371144890785217, - "kl": 0.33162173330783845, - "learning_rate": 4.990863240477266e-06, - "loss": 0.0, - "num_tokens": 336512.0, - "reward": 3.8270575046539306, - "reward_std": 0.13017863007262348, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3420575439929963, - "rewards/quality_reward_func/std": 0.13162680566310883, - "step": 250 - }, - { - "completion_length": 46.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.9, - "completions/max_terminated_length": 46.9, - "completions/mean_length": 42.4, - "completions/mean_terminated_length": 42.4, - "completions/min_length": 37.2, - "completions/min_terminated_length": 37.2, - "epoch": 0.0015759294953388854, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.0, - "kl": 0.29076083535328506, - "learning_rate": 4.9867570427929356e-06, - "loss": 0.0, - "num_tokens": 350200.0, - "reward": 3.8159981250762938, - "reward_std": 0.0735640264581889, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3159981489181518, - "rewards/quality_reward_func/std": 0.07356401411816478, - "step": 260 - }, - { - "completion_length": 49.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 49.5, - "completions/max_terminated_length": 49.5, - "completions/mean_length": 45.65, - "completions/mean_terminated_length": 45.65, - "completions/min_length": 42.4, - "completions/min_terminated_length": 42.4, - "epoch": 0.0016365421682365349, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.2593402564525604, - "kl": 0.3106656195130199, - "learning_rate": 4.981893354823614e-06, - "loss": 0.0, - "num_tokens": 365882.0, - "reward": 4.030669736862182, - "reward_std": 0.08622031668201088, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.530669742822647, - "rewards/quality_reward_func/std": 0.08622029158286751, - "step": 270 - }, - { - "completion_length": 50.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 50.7, - "completions/max_terminated_length": 50.7, - "completions/mean_length": 46.75, - "completions/mean_terminated_length": 46.75, - "completions/min_length": 42.9, - "completions/min_terminated_length": 42.9, - "epoch": 0.0016971548411341844, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.0, - "kl": 0.24476897111162543, - "learning_rate": 4.976273658095772e-06, - "loss": 0.0, - "num_tokens": 380144.0, - "reward": 3.9191980838775633, - "reward_std": 0.14327773749828338, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4491980969905853, - "rewards/quality_reward_func/std": 0.14327771924436092, - "step": 280 - }, - { - "completion_length": 40.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.4, - "completions/max_terminated_length": 40.4, - "completions/mean_length": 38.6, - "completions/mean_terminated_length": 38.6, - "completions/min_length": 36.5, - "completions/min_terminated_length": 36.5, - "epoch": 0.0017577675140318337, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.16908203065395355, - "kl": 0.28217653911560775, - "learning_rate": 4.969899664423473e-06, - "loss": 0.0, - "num_tokens": 394932.0, - "reward": 3.6636978149414063, - "reward_std": 0.146779540553689, - "rewards/coherence_reward_func/mean": 0.9550000011920929, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2086978763341905, - "rewards/quality_reward_func/std": 0.14611819349229335, - "step": 290 - }, - { - "completion_length": 39.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.8, - "completions/max_terminated_length": 39.8, - "completions/mean_length": 36.9, - "completions/mean_terminated_length": 36.9, - "completions/min_length": 33.7, - "completions/min_terminated_length": 33.7, - "epoch": 0.0018183801869294833, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.35093524772673845, - "learning_rate": 4.962773315386935e-06, - "loss": 0.0, - "num_tokens": 406512.0, - "reward": 3.508596086502075, - "reward_std": 0.11237592563265934, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.0085960745811462, - "rewards/quality_reward_func/std": 0.11237595231505111, - "step": 300 - }, - { - "completion_length": 42.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.7, - "completions/max_terminated_length": 42.7, - "completions/mean_length": 40.275, - "completions/mean_terminated_length": 40.275, - "completions/min_length": 37.2, - "completions/min_terminated_length": 37.2, - "epoch": 0.0018789928598271326, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.18939845263957977, - "kl": 0.21862519478891046, - "learning_rate": 4.95489678174111e-06, - "loss": 0.0, - "num_tokens": 419563.0, - "reward": 3.6325401306152343, - "reward_std": 0.12741643376648426, - "rewards/coherence_reward_func/mean": 0.9824999988079071, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1500401258468629, - "rewards/quality_reward_func/std": 0.0930970230139792, - "step": 310 - }, - { - "completion_length": 39.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.0, - "completions/max_terminated_length": 39.0, - "completions/mean_length": 36.525, - "completions/mean_terminated_length": 36.525, - "completions/min_length": 33.5, - "completions/min_terminated_length": 33.5, - "epoch": 0.001939605532724782, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.29974672324024143, - "learning_rate": 4.946272462754447e-06, - "loss": 0.0, - "num_tokens": 431832.0, - "reward": 3.6941110134124755, - "reward_std": 0.07270973902195692, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1941110283136367, - "rewards/quality_reward_func/std": 0.07270971238613129, - "step": 320 - }, - { - "completion_length": 46.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.8, - "completions/max_terminated_length": 46.8, - "completions/mean_length": 43.7, - "completions/mean_terminated_length": 43.7, - "completions/min_length": 40.7, - "completions/min_terminated_length": 40.7, - "epoch": 0.0020002182056224316, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.21079392731189728, - "kl": 0.4155863583087921, - "learning_rate": 4.936902985478055e-06, - "loss": 0.0, - "num_tokens": 446104.0, - "reward": 3.979234480857849, - "reward_std": 0.12566649727523327, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4867344886064529, - "rewards/quality_reward_func/std": 0.14065655674785377, - "step": 330 - }, - { - "completion_length": 45.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.1, - "completions/max_terminated_length": 45.1, - "completions/mean_length": 42.425, - "completions/mean_terminated_length": 42.425, - "completions/min_length": 40.3, - "completions/min_terminated_length": 40.3, - "epoch": 0.002060830878520081, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.12001561373472214, - "kl": 0.37258960865437984, - "learning_rate": 4.926791203945477e-06, - "loss": 0.0, - "num_tokens": 461125.0, - "reward": 3.7795443773269652, - "reward_std": 0.10013196812942624, - "rewards/coherence_reward_func/mean": 0.9550000011920929, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3245443999767303, - "rewards/quality_reward_func/std": 0.08734173709526658, - "step": 340 - }, - { - "completion_length": 49.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 49.0, - "completions/max_terminated_length": 49.0, - "completions/mean_length": 45.7, - "completions/mean_terminated_length": 45.7, - "completions/min_length": 43.0, - "completions/min_terminated_length": 43.0, - "epoch": 0.0021214435514177303, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.2889994978904724, - "kl": 0.40418021026998757, - "learning_rate": 4.915940198303324e-06, - "loss": 0.0, - "num_tokens": 476057.0, - "reward": 3.9986944437026977, - "reward_std": 0.08654631108511239, - "rewards/coherence_reward_func/mean": 0.9549999952316284, - "rewards/coherence_reward_func/std": 0.030000004172325134, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5436944842338562, - "rewards/quality_reward_func/std": 0.05698021114803851, - "step": 350 - }, - { - "completion_length": 42.8125, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.8125, - "completions/max_terminated_length": 42.8125, - "completions/mean_length": 38.9375, - "completions/mean_terminated_length": 38.9375, - "completions/min_length": 34.625, - "completions/min_terminated_length": 34.625, - "epoch": 0.00218205622431538, - "frac_reward_zero_std": 0.125, - "grad_norm": 0.0, - "kl": 0.41664854681584984, - "learning_rate": 4.904353273873029e-06, - "loss": 0.0, - "num_tokens": 489356.0, - "reward": 3.773276299238205, - "reward_std": 0.12748435913817957, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2732763346284628, - "rewards/quality_reward_func/std": 0.1274843459832482, - "step": 360 - }, - { - "completion_length": 33.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 33.5, - "completions/max_terminated_length": 33.5, - "completions/mean_length": 31.025, - "completions/mean_terminated_length": 31.025, - "completions/min_length": 28.8, - "completions/min_terminated_length": 28.8, - "epoch": 0.0022426688972130293, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.33312687277793884, - "kl": 0.5843813385814428, - "learning_rate": 4.89203396014402e-06, - "loss": 0.0, - "num_tokens": 500589.0, - "reward": 3.6452549934387206, - "reward_std": 0.06399927549064159, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1752550661563874, - "rewards/quality_reward_func/std": 0.06399932391941547, - "step": 370 - }, - { - "completion_length": 51.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 51.4, - "completions/max_terminated_length": 51.4, - "completions/mean_length": 46.95, - "completions/mean_terminated_length": 46.95, - "completions/min_length": 41.7, - "completions/min_terminated_length": 41.7, - "epoch": 0.0023032815701106786, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.1448577642440796, - "kl": 0.31766308564692736, - "learning_rate": 4.878986009698596e-06, - "loss": 0.0, - "num_tokens": 514519.0, - "reward": 4.020467591285706, - "reward_std": 0.0818719930946827, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5204676389694214, - "rewards/quality_reward_func/std": 0.08187194960191846, - "step": 380 - }, - { - "completion_length": 42.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.2, - "completions/max_terminated_length": 42.2, - "completions/mean_length": 38.45, - "completions/mean_terminated_length": 38.45, - "completions/min_length": 34.7, - "completions/min_terminated_length": 34.7, - "epoch": 0.0023638942430083284, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.14577415585517883, - "kl": 0.49632331132888796, - "learning_rate": 4.865213397068864e-06, - "loss": 0.0, - "num_tokens": 525401.0, - "reward": 3.7501391172409058, - "reward_std": 0.11387707404792309, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2501392006874084, - "rewards/quality_reward_func/std": 0.11387709490954875, - "step": 390 - }, - { - "completion_length": 50.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 50.7, - "completions/max_terminated_length": 50.7, - "completions/mean_length": 44.3, - "completions/mean_terminated_length": 44.3, - "completions/min_length": 39.0, - "completions/min_terminated_length": 39.0, - "epoch": 0.0024245069159059777, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.18155062198638916, - "kl": 0.5144314333796501, - "learning_rate": 4.850720317526047e-06, - "loss": 0.0, - "num_tokens": 535485.0, - "reward": 3.7378342390060424, - "reward_std": 0.37232128186151386, - "rewards/coherence_reward_func/mean": 0.975, - "rewards/coherence_reward_func/std": 0.05, - "rewards/formatting_reward_func/mean": 1.4625, - "rewards/formatting_reward_func/std": 0.075, - "rewards/quality_reward_func/mean": 1.3003341734409333, - "rewards/quality_reward_func/std": 0.24743340583518147, - "step": 400 - }, - { - "completion_length": 39.46153846153846, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.46153846153846, - "completions/max_terminated_length": 39.46153846153846, - "completions/mean_length": 36.05769230769231, - "completions/mean_terminated_length": 36.05769230769231, - "completions/min_length": 33.53846153846154, - "completions/min_terminated_length": 33.53846153846154, - "epoch": 0.002485119588803627, - "frac_reward_zero_std": 0.38461538461538464, - "grad_norm": 0.0, - "kl": 0.7394945420897924, - "learning_rate": 4.835511185802574e-06, - "loss": 0.0001, - "num_tokens": 547769.0, - "reward": 3.7616154047159047, - "reward_std": 0.06627743204052632, - "rewards/coherence_reward_func/mean": 0.959615381864401, - "rewards/coherence_reward_func/std": 0.011538463143201975, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3020000457763672, - "rewards/quality_reward_func/std": 0.05473898666409346, - "step": 410 - }, - { - "completion_length": 46.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.9, - "completions/max_terminated_length": 46.9, - "completions/mean_length": 41.3, - "completions/mean_terminated_length": 41.3, - "completions/min_length": 36.2, - "completions/min_terminated_length": 36.2, - "epoch": 0.0025457322617012763, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.11636363714933395, - "kl": 0.5952509582042694, - "learning_rate": 4.8195906347473e-06, - "loss": 0.0, - "num_tokens": 559765.0, - "reward": 3.9452152967453005, - "reward_std": 0.0958477696403861, - "rewards/coherence_reward_func/mean": 0.9824999988079071, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.462715357542038, - "rewards/quality_reward_func/std": 0.10085176508873701, - "step": 420 - }, - { - "completion_length": 47.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 47.5, - "completions/max_terminated_length": 47.5, - "completions/mean_length": 41.75, - "completions/mean_terminated_length": 41.75, - "completions/min_length": 35.8, - "completions/min_terminated_length": 35.8, - "epoch": 0.002606344934598926, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.19162562489509583, - "kl": 0.6374471843242645, - "learning_rate": 4.802963513914304e-06, - "loss": 0.0001, - "num_tokens": 574647.0, - "reward": 3.8188122510910034, - "reward_std": 0.07948591830208898, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3188122510910034, - "rewards/quality_reward_func/std": 0.07948592221364378, - "step": 430 - }, - { - "completion_length": 43.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.8, - "completions/max_terminated_length": 43.8, - "completions/mean_length": 41.9, - "completions/mean_terminated_length": 41.9, - "completions/min_length": 39.9, - "completions/min_terminated_length": 39.9, - "epoch": 0.0026669576074965754, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.2602149546146393, - "kl": 0.5979329001158475, - "learning_rate": 4.7856348880856595e-06, - "loss": 0.0, - "num_tokens": 588211.0, - "reward": 3.6818951606750487, - "reward_std": 0.07032759645953775, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.181895188987255, - "rewards/quality_reward_func/std": 0.07032758523710073, - "step": 440 - }, - { - "completion_length": 49.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 49.5, - "completions/max_terminated_length": 49.5, - "completions/mean_length": 45.2, - "completions/mean_terminated_length": 45.2, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, - "epoch": 0.0027275702803942247, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.11991872638463974, - "kl": 0.6710507554933429, - "learning_rate": 4.767610035728663e-06, - "loss": 0.0001, - "num_tokens": 600303.0, - "reward": 3.713807559013367, - "reward_std": 0.13497604615986347, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2138075202703476, - "rewards/quality_reward_func/std": 0.134976077824831, - "step": 450 - }, - { - "completion_length": 45.05555555555556, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.05555555555556, - "completions/max_terminated_length": 45.05555555555556, - "completions/mean_length": 42.486111111111114, - "completions/mean_terminated_length": 42.486111111111114, - "completions/min_length": 40.166666666666664, - "completions/min_terminated_length": 40.166666666666664, - "epoch": 0.0027881829532918744, - "frac_reward_zero_std": 0.2222222222222222, - "grad_norm": 0.27657774090766907, - "kl": 0.413224925700989, - "learning_rate": 4.7488944473879515e-06, - "loss": 0.0, - "num_tokens": 613243.0, - "reward": 3.9092971218956842, - "reward_std": 0.07577277444458257, - "rewards/coherence_reward_func/mean": 0.9902777771155039, - "rewards/coherence_reward_func/std": 0.019444444113307528, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.419019364648395, - "rewards/quality_reward_func/std": 0.05844140792679456, - "step": 460 - }, - { - "completion_length": 36.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.5, - "completions/max_terminated_length": 36.5, - "completions/mean_length": 33.575, - "completions/mean_terminated_length": 33.575, - "completions/min_length": 31.1, - "completions/min_terminated_length": 31.1, - "epoch": 0.0028487956261895237, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.22307322919368744, - "kl": 0.5246760591864585, - "learning_rate": 4.729493824013036e-06, - "loss": 0.0, - "num_tokens": 627118.0, - "reward": 3.798800802230835, - "reward_std": 0.07457497492432594, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2988007545471192, - "rewards/quality_reward_func/std": 0.07457494381815195, - "step": 470 - }, - { - "completion_length": 42.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.1, - "completions/max_terminated_length": 42.1, - "completions/mean_length": 40.425, - "completions/mean_terminated_length": 40.425, - "completions/min_length": 39.0, - "completions/min_terminated_length": 39.0, - "epoch": 0.002909408299087173, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.1554323434829712, - "kl": 0.4022949608042836, - "learning_rate": 4.709414075221734e-06, - "loss": 0.0, - "num_tokens": 640487.0, - "reward": 4.0466917753219604, - "reward_std": 0.08580413579475135, - "rewards/coherence_reward_func/mean": 0.9824999988079071, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5641917943954469, - "rewards/quality_reward_func/std": 0.054607686214149, - "step": 480 - }, - { - "completion_length": 41.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.9, - "completions/max_terminated_length": 41.9, - "completions/mean_length": 39.2, - "completions/mean_terminated_length": 39.2, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, - "epoch": 0.002970020971984823, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.17072752118110657, - "kl": 0.45513674058020115, - "learning_rate": 4.688661317500045e-06, - "loss": 0.0, - "num_tokens": 650843.0, - "reward": 3.897800660133362, - "reward_std": 0.14646206023171543, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3978005647659302, - "rewards/quality_reward_func/std": 0.14646205129101872, - "step": 490 - }, - { - "completion_length": 48.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 48.1, - "completions/max_terminated_length": 48.1, - "completions/mean_length": 45.9, - "completions/mean_terminated_length": 45.9, - "completions/min_length": 43.8, - "completions/min_terminated_length": 43.8, - "epoch": 0.003030633644882472, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.05200561136007309, - "kl": 0.42439612178131936, - "learning_rate": 4.667241872339007e-06, - "loss": 0.0, - "num_tokens": 663787.0, - "reward": 3.6736547708511353, - "reward_std": 0.05465184841305017, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2036547794938088, - "rewards/quality_reward_func/std": 0.05465186738874763, - "step": 500 - }, - { - "completion_length": 43.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.8, - "completions/max_terminated_length": 43.8, - "completions/mean_length": 39.8, - "completions/mean_terminated_length": 39.8, - "completions/min_length": 37.1, - "completions/min_terminated_length": 37.1, - "epoch": 0.0030912463177801214, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.11436619609594345, - "kl": 0.3797212293371558, - "learning_rate": 4.645162264309112e-06, - "loss": 0.0, - "num_tokens": 681563.0, - "reward": 3.493495297431946, - "reward_std": 0.14218758614733815, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 0.9934952914714813, - "rewards/quality_reward_func/std": 0.14218762149102987, - "step": 510 - }, - { - "completion_length": 50.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 50.2, - "completions/max_terminated_length": 50.2, - "completions/mean_length": 45.2, - "completions/mean_terminated_length": 45.2, - "completions/min_length": 40.5, - "completions/min_terminated_length": 40.5, - "epoch": 0.0031518589906777707, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.0, - "kl": 0.4402745446190238, - "learning_rate": 4.622429219072854e-06, - "loss": 0.0, - "num_tokens": 696175.0, - "reward": 3.8434906959533692, - "reward_std": 0.10095033422112465, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.343490782380104, - "rewards/quality_reward_func/std": 0.10095035880804062, - "step": 520 - }, - { - "completion_length": 40.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.9, - "completions/max_terminated_length": 40.9, - "completions/mean_length": 37.45, - "completions/mean_terminated_length": 37.45, - "completions/min_length": 34.3, - "completions/min_terminated_length": 34.3, - "epoch": 0.0032124716635754205, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.17916260659694672, - "kl": 0.40265289824455974, - "learning_rate": 4.599049661336033e-06, - "loss": 0.0, - "num_tokens": 709453.0, - "reward": 3.7069475173950197, - "reward_std": 0.1299577643861994, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2069474697113036, - "rewards/quality_reward_func/std": 0.12995776808820664, - "step": 530 - }, - { - "completion_length": 39.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.5, - "completions/max_terminated_length": 39.5, - "completions/mean_length": 36.275, - "completions/mean_terminated_length": 36.275, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, - "epoch": 0.0032730843364730698, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.3699711788445711, - "learning_rate": 4.5750307127384194e-06, - "loss": 0.0, - "num_tokens": 723384.0, - "reward": 3.491497802734375, - "reward_std": 0.07357308231294155, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 0.9914978355169296, - "rewards/quality_reward_func/std": 0.07357308473438025, - "step": 540 - }, - { - "completion_length": 45.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.2, - "completions/max_terminated_length": 45.2, - "completions/mean_length": 41.625, - "completions/mean_terminated_length": 41.625, - "completions/min_length": 38.4, - "completions/min_terminated_length": 38.4, - "epoch": 0.003333697009370719, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.20592492818832397, - "kl": 0.3234907558187842, - "learning_rate": 4.550379689684431e-06, - "loss": 0.0, - "num_tokens": 734109.0, - "reward": 3.987918663024902, - "reward_std": 0.0583965809782967, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.487918609380722, - "rewards/quality_reward_func/std": 0.05839657171163708, - "step": 550 - }, - { - "completion_length": 44.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.6, - "completions/max_terminated_length": 44.6, - "completions/mean_length": 41.275, - "completions/mean_terminated_length": 41.275, - "completions/min_length": 38.0, - "completions/min_terminated_length": 38.0, - "epoch": 0.003394309682268369, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.3340648262761533, - "learning_rate": 4.5251041011144905e-06, - "loss": 0.0, - "num_tokens": 745636.0, - "reward": 3.9144543409347534, - "reward_std": 0.04685788853093982, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4219542920589447, - "rewards/quality_reward_func/std": 0.03252085950225592, - "step": 560 - }, - { - "completion_length": 35.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.4, - "completions/max_terminated_length": 35.4, - "completions/mean_length": 34.125, - "completions/mean_terminated_length": 34.125, - "completions/min_length": 32.2, - "completions/min_terminated_length": 32.2, - "epoch": 0.003454922355166018, - "frac_reward_zero_std": 0.6, - "grad_norm": 0.0, - "kl": 0.43019469641149044, - "learning_rate": 4.4992116462177274e-06, - "loss": 0.0, - "num_tokens": 759337.0, - "reward": 3.8797792911529543, - "reward_std": 0.10142343789339066, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4097792938351632, - "rewards/quality_reward_func/std": 0.10142342448234558, - "step": 570 - }, - { - "completion_length": 41.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.6, - "completions/max_terminated_length": 41.6, - "completions/mean_length": 37.9, - "completions/mean_terminated_length": 37.9, - "completions/min_length": 32.5, - "completions/min_terminated_length": 32.5, - "epoch": 0.0035155350280636675, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.23039837181568146, - "kl": 0.33485878612846137, - "learning_rate": 4.4727102120867274e-06, - "loss": 0.0, - "num_tokens": 770765.0, - "reward": 3.962643551826477, - "reward_std": 0.08133783508092166, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.462643551826477, - "rewards/quality_reward_func/std": 0.08133785370737315, - "step": 580 - }, - { - "completion_length": 36.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.0, - "completions/max_terminated_length": 36.0, - "completions/mean_length": 33.45, - "completions/mean_terminated_length": 33.45, - "completions/min_length": 31.5, - "completions/min_terminated_length": 31.5, - "epoch": 0.003576147700961317, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.15077538788318634, - "kl": 0.4403778428211808, - "learning_rate": 4.445607871315053e-06, - "loss": 0.0, - "num_tokens": 783155.0, - "reward": 3.712204694747925, - "reward_std": 0.10083713037893176, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2122047781944274, - "rewards/quality_reward_func/std": 0.1008371202275157, - "step": 590 - }, - { - "completion_length": 41.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.9, - "completions/max_terminated_length": 41.9, - "completions/mean_length": 39.625, - "completions/mean_terminated_length": 39.625, - "completions/min_length": 36.8, - "completions/min_terminated_length": 36.8, - "epoch": 0.0036367603738589665, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.206932932138443, - "kl": 0.5848230581730605, - "learning_rate": 4.41791287953825e-06, - "loss": 0.0, - "num_tokens": 797140.0, - "reward": 3.7731836318969725, - "reward_std": 0.07502549570053815, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3031836465001105, - "rewards/quality_reward_func/std": 0.07502551740035415, - "step": 600 - }, - { - "completion_length": 34.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.5, - "completions/max_terminated_length": 34.5, - "completions/mean_length": 31.525, - "completions/mean_terminated_length": 31.525, - "completions/min_length": 29.0, - "completions/min_terminated_length": 29.0, - "epoch": 0.003697373046756616, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.30379366874694824, - "kl": 0.5692525085061788, - "learning_rate": 4.389633672919099e-06, - "loss": 0.0, - "num_tokens": 807213.0, - "reward": 3.4554595232009886, - "reward_std": 0.1391330706886947, - "rewards/coherence_reward_func/mean": 0.9625, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 0.9929595589637756, - "rewards/quality_reward_func/std": 0.13733365600928665, - "step": 610 - }, - { - "completion_length": 38.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.2, - "completions/max_terminated_length": 38.2, - "completions/mean_length": 33.325, - "completions/mean_terminated_length": 33.325, - "completions/min_length": 28.4, - "completions/min_terminated_length": 28.4, - "epoch": 0.003757985719654265, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.4783481229096651, - "learning_rate": 4.360778865577885e-06, - "loss": 0.0, - "num_tokens": 818782.0, - "reward": 3.5967469215393066, - "reward_std": 0.15170758068561555, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1042468905448914, - "rewards/quality_reward_func/std": 0.13676653672009706, - "step": 620 - }, - { - "completion_length": 37.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.5, - "completions/max_terminated_length": 37.5, - "completions/mean_length": 33.975, - "completions/mean_terminated_length": 33.975, - "completions/min_length": 31.0, - "completions/min_terminated_length": 31.0, - "epoch": 0.003818598392551915, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.2585483491420746, - "kl": 0.6426176808774471, - "learning_rate": 4.331357246968447e-06, - "loss": 0.0, - "num_tokens": 833361.0, - "reward": 3.4105613946914675, - "reward_std": 0.1392153210937977, - "rewards/coherence_reward_func/mean": 0.9474999964237213, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 0.9630613967776298, - "rewards/quality_reward_func/std": 0.12421534410677851, - "step": 630 - }, - { - "completion_length": 41.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.4, - "completions/max_terminated_length": 41.4, - "completions/mean_length": 38.5, - "completions/mean_terminated_length": 38.5, - "completions/min_length": 36.3, - "completions/min_terminated_length": 36.3, - "epoch": 0.003879211065449564, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.29496175050735474, - "kl": 0.5105690762400628, - "learning_rate": 4.301377779200826e-06, - "loss": 0.0, - "num_tokens": 847265.0, - "reward": 3.8103545665740968, - "reward_std": 0.12261493569239974, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3403544902801514, - "rewards/quality_reward_func/std": 0.12261495834682137, - "step": 640 - }, - { - "completion_length": 39.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.7, - "completions/max_terminated_length": 39.7, - "completions/mean_length": 35.65, - "completions/mean_terminated_length": 35.65, - "completions/min_length": 32.1, - "completions/min_terminated_length": 32.1, - "epoch": 0.0039398237383472135, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.15986734628677368, - "kl": 0.40894179232418537, - "learning_rate": 4.270849594311323e-06, - "loss": 0.0, - "num_tokens": 857571.0, - "reward": 3.853004574775696, - "reward_std": 0.17061512358486652, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.353004515171051, - "rewards/quality_reward_func/std": 0.17061512283980845, - "step": 650 - }, - { - "completion_length": 43.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.6, - "completions/max_terminated_length": 43.6, - "completions/mean_length": 42.2, - "completions/mean_terminated_length": 42.2, - "completions/min_length": 41.2, - "completions/min_terminated_length": 41.2, - "epoch": 0.004000436411244863, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.5791295766830444, - "kl": 0.42834788020700215, - "learning_rate": 4.239781991480786e-06, - "loss": 0.0, - "num_tokens": 870847.0, - "reward": 3.93058660030365, - "reward_std": 0.04339735861867666, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4305866301059722, - "rewards/quality_reward_func/std": 0.043397351447492835, - "step": 660 - }, - { - "completion_length": 47.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 47.0, - "completions/max_terminated_length": 47.0, - "completions/mean_length": 41.525, - "completions/mean_terminated_length": 41.525, - "completions/min_length": 36.1, - "completions/min_terminated_length": 36.1, - "epoch": 0.004061049084142512, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.2767515182495117, - "kl": 0.4977882787585258, - "learning_rate": 4.208184434201999e-06, - "loss": 0.0, - "num_tokens": 882460.0, - "reward": 3.6590237617492676, - "reward_std": 0.1534363637678325, - "rewards/coherence_reward_func/mean": 0.9400000035762787, - "rewards/coherence_reward_func/std": 0.03464101552963257, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2190237551927567, - "rewards/quality_reward_func/std": 0.12440916821360588, - "step": 670 - }, - { - "completion_length": 45.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.5, - "completions/max_terminated_length": 45.5, - "completions/mean_length": 38.325, - "completions/mean_terminated_length": 38.325, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, - "epoch": 0.004121661757040162, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.22527770698070526, - "kl": 0.4683365380391479, - "learning_rate": 4.176066547396998e-06, - "loss": 0.0, - "num_tokens": 895229.0, - "reward": 3.665366506576538, - "reward_std": 0.1380756115424447, - "rewards/coherence_reward_func/mean": 0.9300000011920929, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2353664577007293, - "rewards/quality_reward_func/std": 0.1380756231606938, - "step": 680 - }, - { - "completion_length": 37.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.9, - "completions/max_terminated_length": 37.9, - "completions/mean_length": 35.625, - "completions/mean_terminated_length": 35.625, - "completions/min_length": 34.4, - "completions/min_terminated_length": 34.4, - "epoch": 0.004182274429937812, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.2846224308013916, - "kl": 0.6151923153549432, - "learning_rate": 4.14343811448524e-06, - "loss": 0.0, - "num_tokens": 907834.0, - "reward": 3.822063159942627, - "reward_std": 0.10759536074474454, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3370631456375122, - "rewards/quality_reward_func/std": 0.12491586001124233, - "step": 690 - }, - { - "completion_length": 35.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.1, - "completions/max_terminated_length": 35.1, - "completions/mean_length": 33.85, - "completions/mean_terminated_length": 33.85, - "completions/min_length": 31.8, - "completions/min_terminated_length": 31.8, - "epoch": 0.0042428871028354605, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.1332438737154007, - "kl": 0.7396882228553295, - "learning_rate": 4.110309074403467e-06, - "loss": 0.0001, - "num_tokens": 924048.0, - "reward": 3.6296366691589355, - "reward_std": 0.09745681836502626, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1446366012096405, - "rewards/quality_reward_func/std": 0.08014893924118952, - "step": 700 - }, - { - "completion_length": 39.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.9, - "completions/max_terminated_length": 39.9, - "completions/mean_length": 37.6, - "completions/mean_terminated_length": 37.6, - "completions/min_length": 35.2, - "completions/min_terminated_length": 35.2, - "epoch": 0.00430349977573311, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.30046993494033813, - "kl": 0.6662097468972206, - "learning_rate": 4.076689518578217e-06, - "loss": 0.0, - "num_tokens": 938532.0, - "reward": 3.9227866411209105, - "reward_std": 0.14562590671703218, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4227866888046266, - "rewards/quality_reward_func/std": 0.14562592869624497, - "step": 710 - }, - { - "completion_length": 38.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.6, - "completions/max_terminated_length": 38.6, - "completions/mean_length": 35.675, - "completions/mean_terminated_length": 35.675, - "completions/min_length": 33.4, - "completions/min_terminated_length": 33.4, - "epoch": 0.00436411244863076, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.21189920604228973, - "kl": 0.785119522549212, - "learning_rate": 4.0425896878518725e-06, - "loss": 0.0, - "num_tokens": 948791.0, - "reward": 3.64911150932312, - "reward_std": 0.0931307939812541, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.4875, - "rewards/formatting_reward_func/std": 0.025, - "rewards/quality_reward_func/mean": 1.1916115328669548, - "rewards/quality_reward_func/std": 0.0681307876482606, - "step": 720 - }, - { - "completion_length": 30.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 30.3, - "completions/max_terminated_length": 30.3, - "completions/mean_length": 28.925, - "completions/mean_terminated_length": 28.925, - "completions/min_length": 27.6, - "completions/min_terminated_length": 27.6, - "epoch": 0.004424725121528409, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.0, - "kl": 0.894575348496437, - "learning_rate": 4.008019969363206e-06, - "loss": 0.0001, - "num_tokens": 964356.0, - "reward": 3.4800790786743163, - "reward_std": 0.06922141835093498, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.0100791096687316, - "rewards/quality_reward_func/std": 0.06922140950337052, - "step": 730 - }, - { - "completion_length": 32.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.2, - "completions/max_terminated_length": 32.2, - "completions/mean_length": 30.5, - "completions/mean_terminated_length": 30.5, - "completions/min_length": 28.6, - "completions/min_terminated_length": 28.6, - "epoch": 0.004485337794426059, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.1204589381814003, - "kl": 0.7596054591238499, - "learning_rate": 3.972990893383356e-06, - "loss": 0.0, - "num_tokens": 976792.0, - "reward": 3.579333543777466, - "reward_std": 0.14822095707058908, - "rewards/coherence_reward_func/mean": 0.9824999988079071, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.0968336045742035, - "rewards/quality_reward_func/std": 0.1199650514870882, - "step": 740 - }, - { - "completion_length": 39.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.9, - "completions/max_terminated_length": 39.9, - "completions/mean_length": 36.9, - "completions/mean_terminated_length": 36.9, - "completions/min_length": 33.0, - "completions/min_terminated_length": 33.0, - "epoch": 0.004545950467323708, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.15538397431373596, - "kl": 0.7892457515932619, - "learning_rate": 3.9375131301081974e-06, - "loss": 0.0001, - "num_tokens": 988452.0, - "reward": 3.878334403038025, - "reward_std": 0.2350888043642044, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3783344507217408, - "rewards/quality_reward_func/std": 0.2350887943059206, - "step": 750 - }, - { - "completion_length": 35.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.0, - "completions/max_terminated_length": 35.0, - "completions/mean_length": 32.5, - "completions/mean_terminated_length": 32.5, - "completions/min_length": 30.7, - "completions/min_terminated_length": 30.7, - "epoch": 0.004606563140221357, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.24180291593074799, - "kl": 0.9362801257520914, - "learning_rate": 3.901597486408105e-06, - "loss": 0.0001, - "num_tokens": 1004208.0, - "reward": 3.828435182571411, - "reward_std": 0.07007699441164732, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.328435128927231, - "rewards/quality_reward_func/std": 0.0700769692659378, - "step": 760 - }, - { - "completion_length": 37.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.6, - "completions/max_terminated_length": 37.6, - "completions/mean_length": 35.475, - "completions/mean_terminated_length": 35.475, - "completions/min_length": 34.3, - "completions/min_terminated_length": 34.3, - "epoch": 0.004667175813119007, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.1852080076932907, - "kl": 0.9107007831335068, - "learning_rate": 3.865254902536073e-06, - "loss": 0.0001, - "num_tokens": 1017951.0, - "reward": 3.988982844352722, - "reward_std": 0.12737804618664086, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5189830183982849, - "rewards/quality_reward_func/std": 0.12737804912030698, - "step": 770 - }, - { - "completion_length": 44.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.6, - "completions/max_terminated_length": 44.6, - "completions/mean_length": 40.175, - "completions/mean_terminated_length": 40.175, - "completions/min_length": 34.6, - "completions/min_terminated_length": 34.6, - "epoch": 0.004727788486016657, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.2061929851770401, - "kl": 0.76973906904459, - "learning_rate": 3.828496448795208e-06, - "loss": 0.0001, - "num_tokens": 1030462.0, - "reward": 3.652807593345642, - "reward_std": 0.31089982322300785, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.167807585000992, - "rewards/quality_reward_func/std": 0.29357928061290295, - "step": 780 - }, - { - "completion_length": 35.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.0, - "completions/max_terminated_length": 35.0, - "completions/mean_length": 33.2, - "completions/mean_terminated_length": 33.2, - "completions/min_length": 32.1, - "completions/min_terminated_length": 32.1, - "epoch": 0.004788401158914306, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.13467849791049957, - "kl": 0.7465240528807044, - "learning_rate": 3.791333322166605e-06, - "loss": 0.0, - "num_tokens": 1046838.0, - "reward": 3.8106732606887816, - "reward_std": 0.10934643987566232, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3106732189655304, - "rewards/quality_reward_func/std": 0.10934643484652043, - "step": 790 - }, - { - "completion_length": 32.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.5, - "completions/max_terminated_length": 32.5, - "completions/mean_length": 31.425, - "completions/mean_terminated_length": 31.425, - "completions/min_length": 29.4, - "completions/min_terminated_length": 29.4, - "epoch": 0.004849013831811955, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.7720465335994959, - "learning_rate": 3.753776842898644e-06, - "loss": 0.0, - "num_tokens": 1058895.0, - "reward": 3.6502453088760376, - "reward_std": 0.15514872260391713, - "rewards/coherence_reward_func/mean": 0.9824999988079071, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1677452743053436, - "rewards/quality_reward_func/std": 0.12014871742576361, - "step": 800 - }, - { - "completion_length": 35.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.1, - "completions/max_terminated_length": 35.1, - "completions/mean_length": 33.1, - "completions/mean_terminated_length": 33.1, - "completions/min_length": 31.2, - "completions/min_terminated_length": 31.2, - "epoch": 0.004909626504709605, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.21491074562072754, - "kl": 1.1367528922855854, - "learning_rate": 3.7158384510587264e-06, - "loss": 0.0001, - "num_tokens": 1070043.0, - "reward": 3.6540565967559813, - "reward_std": 0.15607355162501335, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1540565967559815, - "rewards/quality_reward_func/std": 0.15607354342937468, - "step": 810 - }, - { - "completion_length": 32.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.6, - "completions/max_terminated_length": 32.6, - "completions/mean_length": 30.85, - "completions/mean_terminated_length": 30.85, - "completions/min_length": 28.5, - "completions/min_terminated_length": 28.5, - "epoch": 0.004970239177607254, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.25827139616012573, - "kl": 0.8578541923314333, - "learning_rate": 3.677529703048525e-06, - "loss": 0.0001, - "num_tokens": 1085957.0, - "reward": 3.5395549058914186, - "reward_std": 0.15808383515104651, - "rewards/coherence_reward_func/mean": 0.9174999982118607, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1220549523830414, - "rewards/quality_reward_func/std": 0.12323397975414992, - "step": 820 - }, - { - "completion_length": 34.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.6, - "completions/max_terminated_length": 34.6, - "completions/mean_length": 31.275, - "completions/mean_terminated_length": 31.275, - "completions/min_length": 28.7, - "completions/min_terminated_length": 28.7, - "epoch": 0.005030851850504904, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.9296011418104172, - "learning_rate": 3.6388622680837893e-06, - "loss": 0.0001, - "num_tokens": 1101220.0, - "reward": 3.607322669029236, - "reward_std": 0.2502354118973017, - "rewards/coherence_reward_func/mean": 0.925, - "rewards/coherence_reward_func/std": 0.05, - "rewards/formatting_reward_func/mean": 1.4625, - "rewards/formatting_reward_func/std": 0.025, - "rewards/quality_reward_func/mean": 1.2198225945234298, - "rewards/quality_reward_func/std": 0.1752354133874178, - "step": 830 - }, - { - "completion_length": 34.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.1, - "completions/max_terminated_length": 34.1, - "completions/mean_length": 33.025, - "completions/mean_terminated_length": 33.025, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, - "epoch": 0.005091464523402553, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.0, - "kl": 0.700504607334733, - "learning_rate": 3.599847924639788e-06, - "loss": 0.0, - "num_tokens": 1115853.0, - "reward": 3.888926315307617, - "reward_std": 0.05376259086187929, - "rewards/coherence_reward_func/mean": 0.9300000011920929, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4589263439178466, - "rewards/quality_reward_func/std": 0.05376259498298168, - "step": 840 - }, - { - "completion_length": 35.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.7, - "completions/max_terminated_length": 35.7, - "completions/mean_length": 32.525, - "completions/mean_terminated_length": 32.525, - "completions/min_length": 29.1, - "completions/min_terminated_length": 29.1, - "epoch": 0.005152077196300202, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.3158005475997925, - "kl": 0.9616607185453176, - "learning_rate": 3.5604985568634754e-06, - "loss": 0.0001, - "num_tokens": 1127642.0, - "reward": 3.815172529220581, - "reward_std": 0.16472034882754089, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3151724725961684, - "rewards/quality_reward_func/std": 0.16472036391496658, - "step": 850 - }, - { - "completion_length": 34.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.1, - "completions/max_terminated_length": 34.1, - "completions/mean_length": 31.975, - "completions/mean_terminated_length": 31.975, - "completions/min_length": 29.7, - "completions/min_terminated_length": 29.7, - "epoch": 0.005212689869197852, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.17151622474193573, - "kl": 1.0605566695332527, - "learning_rate": 3.5208261509534627e-06, - "loss": 0.0001, - "num_tokens": 1140577.0, - "reward": 3.7323100566864014, - "reward_std": 0.05832284335047007, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2323099792003631, - "rewards/quality_reward_func/std": 0.058322815038263795, - "step": 860 - }, - { - "completion_length": 33.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 33.2, - "completions/max_terminated_length": 33.2, - "completions/mean_length": 32.25, - "completions/mean_terminated_length": 32.25, - "completions/min_length": 31.3, - "completions/min_terminated_length": 31.3, - "epoch": 0.005273302542095501, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.915602894872427, - "learning_rate": 3.480842791508904e-06, - "loss": 0.0001, - "num_tokens": 1152707.0, - "reward": 3.932111144065857, - "reward_std": 0.08545712381601334, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4321111202239991, - "rewards/quality_reward_func/std": 0.08545711189508438, - "step": 870 - }, - { - "completion_length": 32.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.0, - "completions/max_terminated_length": 32.0, - "completions/mean_length": 30.15, - "completions/mean_terminated_length": 30.15, - "completions/min_length": 28.5, - "completions/min_terminated_length": 28.5, - "epoch": 0.005333915214993151, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.22786682844161987, - "kl": 0.8655254432931543, - "learning_rate": 3.440560657848414e-06, - "loss": 0.0001, - "num_tokens": 1163557.0, - "reward": 3.6994583368301392, - "reward_std": 0.15054690847173333, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2294583201408387, - "rewards/quality_reward_func/std": 0.15054690733086318, - "step": 880 - }, - { - "completion_length": 36.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.7, - "completions/max_terminated_length": 36.7, - "completions/mean_length": 35.375, - "completions/mean_terminated_length": 35.375, - "completions/min_length": 34.6, - "completions/min_terminated_length": 34.6, - "epoch": 0.0053945278878908005, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.38637974858283997, - "kl": 0.8875150509178639, - "learning_rate": 3.3999920203001287e-06, - "loss": 0.0001, - "num_tokens": 1174868.0, - "reward": 3.6290750026702883, - "reward_std": 0.1874656138010323, - "rewards/coherence_reward_func/mean": 0.9524999976158142, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1765749841928481, - "rewards/quality_reward_func/std": 0.1524656143039465, - "step": 890 - }, - { - "completion_length": 42.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.8, - "completions/max_terminated_length": 42.8, - "completions/mean_length": 39.975, - "completions/mean_terminated_length": 39.975, - "completions/min_length": 36.2, - "completions/min_terminated_length": 36.2, - "epoch": 0.005455140560788449, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.08063001930713654, - "kl": 0.5390862679574638, - "learning_rate": 3.359149236464041e-06, - "loss": 0.0, - "num_tokens": 1186207.0, - "reward": 3.800839424133301, - "reward_std": 0.176435112580657, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.4875, - "rewards/formatting_reward_func/std": 0.025, - "rewards/quality_reward_func/mean": 1.3208393454551697, - "rewards/quality_reward_func/std": 0.13643510080873966, - "step": 900 - }, - { - "completion_length": 36.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.3, - "completions/max_terminated_length": 36.3, - "completions/mean_length": 35.375, - "completions/mean_terminated_length": 35.375, - "completions/min_length": 33.8, - "completions/min_terminated_length": 33.8, - "epoch": 0.005515753233686099, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.1309899240732193, - "kl": 0.7234176866710186, - "learning_rate": 3.31804474744776e-06, - "loss": 0.0, - "num_tokens": 1200946.0, - "reward": 3.799390172958374, - "reward_std": 0.10073573589324951, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3293901562690735, - "rewards/quality_reward_func/std": 0.10073572732508182, - "step": 910 - }, - { - "completion_length": 45.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.3, - "completions/max_terminated_length": 45.3, - "completions/mean_length": 42.125, - "completions/mean_terminated_length": 42.125, - "completions/min_length": 39.6, - "completions/min_terminated_length": 39.6, - "epoch": 0.005576365906583749, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.6035341188311577, - "learning_rate": 3.27669107407683e-06, - "loss": 0.0, - "num_tokens": 1213827.0, - "reward": 3.8284897327423097, - "reward_std": 0.1031534805893898, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3284897089004517, - "rewards/quality_reward_func/std": 0.10315347984433174, - "step": 920 - }, - { - "completion_length": 41.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.8, - "completions/max_terminated_length": 41.8, - "completions/mean_length": 38.7, - "completions/mean_terminated_length": 38.7, - "completions/min_length": 35.9, - "completions/min_terminated_length": 35.9, - "epoch": 0.005636978579481398, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.6392061043530702, - "learning_rate": 3.23510081308076e-06, - "loss": 0.0001, - "num_tokens": 1228195.0, - "reward": 3.8545461177825926, - "reward_std": 0.14761899150907992, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3845462083816529, - "rewards/quality_reward_func/std": 0.14761900324374438, - "step": 930 - }, - { - "completion_length": 41.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.7, - "completions/max_terminated_length": 41.7, - "completions/mean_length": 38.775, - "completions/mean_terminated_length": 38.775, - "completions/min_length": 36.2, - "completions/min_terminated_length": 36.2, - "epoch": 0.0056975912523790475, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.2682724595069885, - "kl": 0.6088701412081718, - "learning_rate": 3.1932866332559455e-06, - "loss": 0.0, - "num_tokens": 1241286.0, - "reward": 3.984172892570496, - "reward_std": 0.11729427184909583, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4841729283332825, - "rewards/quality_reward_func/std": 0.11729424688965082, - "step": 940 - }, - { - "completion_length": 41.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.6, - "completions/max_terminated_length": 41.6, - "completions/mean_length": 39.275, - "completions/mean_terminated_length": 39.275, - "completions/min_length": 37.3, - "completions/min_terminated_length": 37.3, - "epoch": 0.005758203925276697, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.2781446576118469, - "kl": 0.5564841218292713, - "learning_rate": 3.1512612716066217e-06, - "loss": 0.0, - "num_tokens": 1253945.0, - "reward": 3.909769296646118, - "reward_std": 0.0962110199034214, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4397692739963532, - "rewards/quality_reward_func/std": 0.09621100360527635, - "step": 950 - }, - { - "completion_length": 43.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.5, - "completions/max_terminated_length": 43.5, - "completions/mean_length": 39.55, - "completions/mean_terminated_length": 39.55, - "completions/min_length": 35.9, - "completions/min_terminated_length": 35.9, - "epoch": 0.005818816598174346, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.0, - "kl": 0.6215634207590484, - "learning_rate": 3.1090375294650565e-06, - "loss": 0.0, - "num_tokens": 1265727.0, - "reward": 4.014461946487427, - "reward_std": 0.09971704296767711, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.544461864233017, - "rewards/quality_reward_func/std": 0.09971703067421914, - "step": 960 - }, - { - "completion_length": 39.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.4, - "completions/max_terminated_length": 39.4, - "completions/mean_length": 37.675, - "completions/mean_terminated_length": 37.675, - "completions/min_length": 36.1, - "completions/min_terminated_length": 36.1, - "epoch": 0.005879429271071996, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.47595250606536865, - "kl": 0.7499655776191503, - "learning_rate": 3.066628268592138e-06, - "loss": 0.0001, - "num_tokens": 1281362.0, - "reward": 3.853908157348633, - "reward_std": 0.04199212994426489, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3539081931114196, - "rewards/quality_reward_func/std": 0.04199216105625965, - "step": 970 - }, - { - "completion_length": 41.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.5, - "completions/max_terminated_length": 41.5, - "completions/mean_length": 38.9, - "completions/mean_terminated_length": 38.9, - "completions/min_length": 36.8, - "completions/min_terminated_length": 36.8, - "epoch": 0.005940041943969646, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.17277319729328156, - "kl": 0.5382255587726832, - "learning_rate": 3.0240464072595547e-06, - "loss": 0.0, - "num_tokens": 1293686.0, - "reward": 3.7531413316726683, - "reward_std": 0.09528634613379836, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2531413912773133, - "rewards/quality_reward_func/std": 0.09528628345578909, - "step": 980 - }, - { - "completion_length": 35.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.9, - "completions/max_terminated_length": 35.9, - "completions/mean_length": 32.65, - "completions/mean_terminated_length": 32.65, - "completions/min_length": 29.7, - "completions/min_terminated_length": 29.7, - "epoch": 0.0060006546168672945, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.3110400140285492, - "kl": 0.7709437053650617, - "learning_rate": 2.981304916314769e-06, - "loss": 0.0001, - "num_tokens": 1303592.0, - "reward": 3.9956079721450806, - "reward_std": 0.06759364753961564, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.495608001947403, - "rewards/quality_reward_func/std": 0.06759367641061545, - "step": 990 - }, - { - "completion_length": 38.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.4, - "completions/max_terminated_length": 38.4, - "completions/mean_length": 35.15, - "completions/mean_terminated_length": 35.15, - "completions/min_length": 31.8, - "completions/min_terminated_length": 31.8, - "epoch": 0.006061267289764944, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.32638901472091675, - "kl": 0.7601682379841804, - "learning_rate": 2.938416815229968e-06, - "loss": 0.0001, - "num_tokens": 1318422.0, - "reward": 3.9276065587997437, - "reward_std": 0.18274470856413244, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4276066362857818, - "rewards/quality_reward_func/std": 0.1827447606716305, - "step": 1000 - }, - { - "completion_length": 34.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.8, - "completions/max_terminated_length": 34.8, - "completions/mean_length": 33.4, - "completions/mean_terminated_length": 33.4, - "completions/min_length": 32.4, - "completions/min_terminated_length": 32.4, - "epoch": 0.006121879962662593, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.7482025191187859, - "learning_rate": 2.8953951681362098e-06, - "loss": 0.0, - "num_tokens": 1331878.0, - "reward": 3.7783411264419557, - "reward_std": 0.06739873860497028, - "rewards/coherence_reward_func/mean": 0.9400000005960465, - "rewards/coherence_reward_func/std": 0.019999998807907104, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3383410692214965, - "rewards/quality_reward_func/std": 0.050962546234950425, - "step": 1010 - }, - { - "completion_length": 39.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.5, - "completions/max_terminated_length": 39.5, - "completions/mean_length": 35.85, - "completions/mean_terminated_length": 35.85, - "completions/min_length": 33.0, - "completions/min_terminated_length": 33.0, - "epoch": 0.006182492635560243, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.1603815257549286, - "kl": 0.5867549292743206, - "learning_rate": 2.852253079843957e-06, - "loss": 0.0, - "num_tokens": 1345044.0, - "reward": 3.9886896133422853, - "reward_std": 0.16920062620192766, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5186896204948426, - "rewards/quality_reward_func/std": 0.16920066829770802, - "step": 1020 - }, - { - "completion_length": 33.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 33.5, - "completions/max_terminated_length": 33.5, - "completions/mean_length": 32.25, - "completions/mean_terminated_length": 32.25, - "completions/min_length": 30.7, - "completions/min_terminated_length": 30.7, - "epoch": 0.006243105308457893, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.0, - "kl": 0.6418775924947113, - "learning_rate": 2.809003691851232e-06, - "loss": 0.0, - "num_tokens": 1356874.0, - "reward": 3.8942331075668335, - "reward_std": 0.0729768855497241, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4242331296205522, - "rewards/quality_reward_func/std": 0.07297688759863377, - "step": 1030 - }, - { - "completion_length": 39.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.3, - "completions/max_terminated_length": 39.3, - "completions/mean_length": 36.875, - "completions/mean_terminated_length": 36.875, - "completions/min_length": 33.1, - "completions/min_terminated_length": 33.1, - "epoch": 0.0063037179813555414, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.319623738527298, - "kl": 0.8076464911922813, - "learning_rate": 2.7656601783405833e-06, - "loss": 0.0001, - "num_tokens": 1371601.0, - "reward": 4.063040947914123, - "reward_std": 0.07689689043909312, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5630410730838775, - "rewards/quality_reward_func/std": 0.07689687758684158, - "step": 1040 - }, - { - "completion_length": 39.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.7, - "completions/max_terminated_length": 39.7, - "completions/mean_length": 35.15, - "completions/mean_terminated_length": 35.15, - "completions/min_length": 31.8, - "completions/min_terminated_length": 31.8, - "epoch": 0.006364330654253191, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.5481071472167969, - "kl": 0.8138475578278304, - "learning_rate": 2.7222357421661042e-06, - "loss": 0.0001, - "num_tokens": 1384083.0, - "reward": 3.9324022054672243, - "reward_std": 0.12546173660084606, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.43240225315094, - "rewards/quality_reward_func/std": 0.12546178670600056, - "step": 1050 - }, - { - "completion_length": 31.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 31.1, - "completions/max_terminated_length": 31.1, - "completions/mean_length": 30.075, - "completions/mean_terminated_length": 30.075, - "completions/min_length": 28.7, - "completions/min_terminated_length": 28.7, - "epoch": 0.006424943327150841, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.11690180003643036, - "kl": 0.8220470007508993, - "learning_rate": 2.678743610831715e-06, - "loss": 0.0, - "num_tokens": 1395578.0, - "reward": 3.8780935287475584, - "reward_std": 0.07352278865873814, - "rewards/coherence_reward_func/mean": 0.9600000023841858, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4180934369564056, - "rewards/quality_reward_func/std": 0.07352277263998985, - "step": 1060 - }, - { - "completion_length": 34.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.2, - "completions/max_terminated_length": 34.2, - "completions/mean_length": 33.2, - "completions/mean_terminated_length": 33.2, - "completions/min_length": 31.8, - "completions/min_terminated_length": 31.8, - "epoch": 0.00648555600004849, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.13903403282165527, - "kl": 0.8520046532154083, - "learning_rate": 2.635197032461939e-06, - "loss": 0.0001, - "num_tokens": 1411434.0, - "reward": 3.925128865242004, - "reward_std": 0.21941234022378922, - "rewards/coherence_reward_func/mean": 0.8925000071525574, - "rewards/coherence_reward_func/std": 0.03500000238418579, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.532628881931305, - "rewards/quality_reward_func/std": 0.18441240545362234, - "step": 1070 - }, - { - "completion_length": 40.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.5, - "completions/max_terminated_length": 40.5, - "completions/mean_length": 35.925, - "completions/mean_terminated_length": 35.925, - "completions/min_length": 31.4, - "completions/min_terminated_length": 31.4, - "epoch": 0.0065461686729461396, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.3517349660396576, - "kl": 0.7328827120363712, - "learning_rate": 2.591609271766391e-06, - "loss": 0.0001, - "num_tokens": 1426995.0, - "reward": 3.5588266372680666, - "reward_std": 0.1551475243177265, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.05882670879364, - "rewards/quality_reward_func/std": 0.1551475458778441, - "step": 1080 - }, - { - "completion_length": 44.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.5, - "completions/max_terminated_length": 44.5, - "completions/mean_length": 38.775, - "completions/mean_terminated_length": 38.775, - "completions/min_length": 33.6, - "completions/min_terminated_length": 33.6, - "epoch": 0.006606781345843789, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.11026515811681747, - "kl": 0.5106275354512035, - "learning_rate": 2.547993605999225e-06, - "loss": 0.0, - "num_tokens": 1440410.0, - "reward": 3.989608359336853, - "reward_std": 0.0880632999818772, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.489608359336853, - "rewards/quality_reward_func/std": 0.0880632677813992, - "step": 1090 - }, - { - "completion_length": 38.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.4, - "completions/max_terminated_length": 38.4, - "completions/mean_length": 35.325, - "completions/mean_terminated_length": 35.325, - "completions/min_length": 32.3, - "completions/min_terminated_length": 32.3, - "epoch": 0.006667394018741438, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.0907859206199646, - "kl": 0.6644573543220759, - "learning_rate": 2.504363320914746e-06, - "loss": 0.0, - "num_tokens": 1453323.0, - "reward": 4.11377637386322, - "reward_std": 0.09255220675840974, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6137763619422913, - "rewards/quality_reward_func/std": 0.0925522580742836, - "step": 1100 - }, - { - "completion_length": 35.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.1, - "completions/max_terminated_length": 35.1, - "completions/mean_length": 33.6, - "completions/mean_terminated_length": 33.6, - "completions/min_length": 32.5, - "completions/min_terminated_length": 32.5, - "epoch": 0.006728006691639088, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.8078338511288166, - "learning_rate": 2.460731706720449e-06, - "loss": 0.0001, - "num_tokens": 1466455.0, - "reward": 3.923912596702576, - "reward_std": 0.04916523024439812, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.453912580013275, - "rewards/quality_reward_func/std": 0.04916525389999151, - "step": 1110 - }, - { - "completion_length": 38.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.8, - "completions/max_terminated_length": 38.8, - "completions/mean_length": 35.225, - "completions/mean_terminated_length": 35.225, - "completions/min_length": 31.1, - "completions/min_terminated_length": 31.1, - "epoch": 0.006788619364536738, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.0, - "kl": 0.8176405504345894, - "learning_rate": 2.4171120540286848e-06, - "loss": 0.0001, - "num_tokens": 1481596.0, - "reward": 3.590487003326416, - "reward_std": 0.30801473204046487, - "rewards/coherence_reward_func/mean": 0.9350000023841858, - "rewards/coherence_reward_func/std": 0.05, - "rewards/formatting_reward_func/mean": 1.4625, - "rewards/formatting_reward_func/std": 0.075, - "rewards/quality_reward_func/mean": 1.1929870069026947, - "rewards/quality_reward_func/std": 0.18301476575434208, - "step": 1120 - }, - { - "completion_length": 38.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.8, - "completions/max_terminated_length": 38.8, - "completions/mean_length": 35.95, - "completions/mean_terminated_length": 35.95, - "completions/min_length": 33.7, - "completions/min_terminated_length": 33.7, - "epoch": 0.0068492320374343866, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.2624747157096863, - "kl": 0.6980829928070307, - "learning_rate": 2.373517649808217e-06, - "loss": 0.0001, - "num_tokens": 1495434.0, - "reward": 4.031958103179932, - "reward_std": 0.05438796263188124, - "rewards/coherence_reward_func/mean": 0.9625000059604645, - "rewards/coherence_reward_func/std": 0.03500000238418579, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5694581747055054, - "rewards/quality_reward_func/std": 0.03075642976909876, - "step": 1130 - }, - { - "completion_length": 32.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.1, - "completions/max_terminated_length": 32.1, - "completions/mean_length": 31.075, - "completions/mean_terminated_length": 31.075, - "completions/min_length": 29.6, - "completions/min_terminated_length": 29.6, - "epoch": 0.006909844710332036, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.2690404951572418, - "kl": 0.6380854770541191, - "learning_rate": 2.3299617733368805e-06, - "loss": 0.0, - "num_tokens": 1507609.0, - "reward": 3.964665150642395, - "reward_std": 0.09063520752824843, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.464665138721466, - "rewards/quality_reward_func/std": 0.09063521791249514, - "step": 1140 - }, - { - "completion_length": 39.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.8, - "completions/max_terminated_length": 39.8, - "completions/mean_length": 34.225, - "completions/mean_terminated_length": 34.225, - "completions/min_length": 30.8, - "completions/min_terminated_length": 30.8, - "epoch": 0.006970457383229686, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.7512777108699084, - "learning_rate": 2.2864576921565816e-06, - "loss": 0.0001, - "num_tokens": 1522458.0, - "reward": 3.738456439971924, - "reward_std": 0.03931144876405597, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2384564340114594, - "rewards/quality_reward_func/std": 0.03931143744848668, - "step": 1150 - }, - { - "completion_length": 37.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.8, - "completions/max_terminated_length": 37.8, - "completions/mean_length": 34.8, - "completions/mean_terminated_length": 34.8, - "completions/min_length": 32.7, - "completions/min_terminated_length": 32.7, - "epoch": 0.007031070056127335, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.28654745221138, - "kl": 0.5811090076342225, - "learning_rate": 2.2430186580318833e-06, - "loss": 0.0, - "num_tokens": 1534546.0, - "reward": 3.851572132110596, - "reward_std": 0.19850474322447553, - "rewards/coherence_reward_func/mean": 0.9675000011920929, - "rewards/coherence_reward_func/std": 0.052320507168769834, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3840721726417542, - "rewards/quality_reward_func/std": 0.14700013548135757, - "step": 1160 - }, - { - "completion_length": 35.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.6, - "completions/max_terminated_length": 35.6, - "completions/mean_length": 32.175, - "completions/mean_terminated_length": 32.175, - "completions/min_length": 29.6, - "completions/min_terminated_length": 29.6, - "epoch": 0.007091682729024985, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.27025097608566284, - "kl": 0.8093738172203302, - "learning_rate": 2.1996579029133826e-06, - "loss": 0.0001, - "num_tokens": 1548969.0, - "reward": 3.7685044527053835, - "reward_std": 0.1714465655386448, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2835045337677002, - "rewards/quality_reward_func/std": 0.1541260455735028, - "step": 1170 - }, - { - "completion_length": 40.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.7, - "completions/max_terminated_length": 40.7, - "completions/mean_length": 36.625, - "completions/mean_terminated_length": 36.625, - "completions/min_length": 33.7, - "completions/min_terminated_length": 33.7, - "epoch": 0.007152295401922634, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.5391599021852016, - "learning_rate": 2.156388634907134e-06, - "loss": 0.0, - "num_tokens": 1560742.0, - "reward": 3.805975413322449, - "reward_std": 0.07460486870259046, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3059754014015197, - "rewards/quality_reward_func/std": 0.07460487484931946, - "step": 1180 - }, - { - "completion_length": 35.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.0, - "completions/max_terminated_length": 35.0, - "completions/mean_length": 31.45, - "completions/mean_terminated_length": 31.45, - "completions/min_length": 28.9, - "completions/min_terminated_length": 28.9, - "epoch": 0.007212908074820283, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.18683335185050964, - "kl": 0.537571616191417, - "learning_rate": 2.1132240342513304e-06, - "loss": 0.0, - "num_tokens": 1572608.0, - "reward": 3.5767133951187136, - "reward_std": 0.12494921423494816, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.0767134875059128, - "rewards/quality_reward_func/std": 0.12494920939207077, - "step": 1190 - }, - { - "completion_length": 35.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.9, - "completions/max_terminated_length": 35.9, - "completions/mean_length": 33.25, - "completions/mean_terminated_length": 33.25, - "completions/min_length": 30.5, - "completions/min_terminated_length": 30.5, - "epoch": 0.007273520747717933, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.7194669954478741, - "learning_rate": 2.070177249301476e-06, - "loss": 0.0, - "num_tokens": 1585502.0, - "reward": 3.809039855003357, - "reward_std": 0.10976020721718668, - "rewards/coherence_reward_func/mean": 0.9150000035762786, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3940398573875428, - "rewards/quality_reward_func/std": 0.11074666082859039, - "step": 1200 - }, - { - "completion_length": 34.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.6, - "completions/max_terminated_length": 34.6, - "completions/mean_length": 32.2, - "completions/mean_terminated_length": 32.2, - "completions/min_length": 29.9, - "completions/min_terminated_length": 29.9, - "epoch": 0.007334133420615582, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.2777557969093323, - "kl": 0.682066448405385, - "learning_rate": 2.0272613925252716e-06, - "loss": 0.0, - "num_tokens": 1596782.0, - "reward": 3.4940081596374513, - "reward_std": 0.07237992389127612, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 0.9940082013607026, - "rewards/quality_reward_func/std": 0.07237992798909545, - "step": 1210 - }, - { - "completion_length": 42.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.3, - "completions/max_terminated_length": 42.3, - "completions/mean_length": 37.725, - "completions/mean_terminated_length": 37.725, - "completions/min_length": 33.9, - "completions/min_terminated_length": 33.9, - "epoch": 0.007394746093513232, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.49028753973543643, - "learning_rate": 1.9844895365084264e-06, - "loss": 0.0, - "num_tokens": 1607003.0, - "reward": 3.8880168437957763, - "reward_std": 0.11936564119532704, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.388016813993454, - "rewards/quality_reward_func/std": 0.11936561986804009, - "step": 1220 - }, - { - "completion_length": 37.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.8, - "completions/max_terminated_length": 37.8, - "completions/mean_length": 34.925, - "completions/mean_terminated_length": 34.925, - "completions/min_length": 33.3, - "completions/min_terminated_length": 33.3, - "epoch": 0.007455358766410881, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.17374633252620697, - "kl": 0.8674410484731198, - "learning_rate": 1.941874709972622e-06, - "loss": 0.0001, - "num_tokens": 1618968.0, - "reward": 4.1591418266296385, - "reward_std": 0.08483809668105095, - "rewards/coherence_reward_func/mean": 0.9774999976158142, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6816418051719666, - "rewards/quality_reward_func/std": 0.06996260732412338, - "step": 1230 - }, - { - "completion_length": 32.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.1, - "completions/max_terminated_length": 32.1, - "completions/mean_length": 30.85, - "completions/mean_terminated_length": 30.85, - "completions/min_length": 29.7, - "completions/min_terminated_length": 29.7, - "epoch": 0.00751597143930853, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.6672137396410107, - "learning_rate": 1.899429893806841e-06, - "loss": 0.0, - "num_tokens": 1630122.0, - "reward": 3.6958040475845335, - "reward_std": 0.03936842924449593, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.225804054737091, - "rewards/quality_reward_func/std": 0.03936843371484429, - "step": 1240 - }, - { - "completion_length": 40.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.9, - "completions/max_terminated_length": 40.9, - "completions/mean_length": 39.375, - "completions/mean_terminated_length": 39.375, - "completions/min_length": 37.8, - "completions/min_terminated_length": 37.8, - "epoch": 0.00757658411220618, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.524336944334209, - "learning_rate": 1.8571680171132603e-06, - "loss": 0.0, - "num_tokens": 1642665.0, - "reward": 3.922765851020813, - "reward_std": 0.046882809279486536, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4227658241987229, - "rewards/quality_reward_func/std": 0.04688282259739936, - "step": 1250 - }, - { - "completion_length": 44.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.8, - "completions/max_terminated_length": 44.8, - "completions/mean_length": 41.65, - "completions/mean_terminated_length": 41.65, - "completions/min_length": 39.0, - "completions/min_terminated_length": 39.0, - "epoch": 0.00763719678510383, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.18260115385055542, - "kl": 0.6145955871790647, - "learning_rate": 1.815101953268919e-06, - "loss": 0.0001, - "num_tokens": 1659159.0, - "reward": 3.9243629217147826, - "reward_std": 0.0768905753735453, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.424362874031067, - "rewards/quality_reward_func/std": 0.07689053332433105, - "step": 1260 - }, - { - "completion_length": 36.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.6, - "completions/max_terminated_length": 36.6, - "completions/mean_length": 34.2, - "completions/mean_terminated_length": 34.2, - "completions/min_length": 32.4, - "completions/min_terminated_length": 32.4, - "epoch": 0.007697809458001479, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.4756428599357605, - "kl": 0.43017003908753393, - "learning_rate": 1.7732445160043687e-06, - "loss": 0.0, - "num_tokens": 1669975.0, - "reward": 3.738416600227356, - "reward_std": 0.11633943617343903, - "rewards/coherence_reward_func/mean": 0.9399999976158142, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.298416656255722, - "rewards/quality_reward_func/std": 0.11633945852518082, - "step": 1270 - }, - { - "completion_length": 42.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.1, - "completions/max_terminated_length": 42.1, - "completions/mean_length": 38.2, - "completions/mean_terminated_length": 38.2, - "completions/min_length": 33.9, - "completions/min_terminated_length": 33.9, - "epoch": 0.007758422130899128, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.0, - "kl": 0.5230879709124565, - "learning_rate": 1.7316084555004825e-06, - "loss": 0.0, - "num_tokens": 1682883.0, - "reward": 3.7134841203689577, - "reward_std": 0.08533983188681304, - "rewards/coherence_reward_func/mean": 0.9474999964237213, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2659839987754822, - "rewards/quality_reward_func/std": 0.10033981129527092, - "step": 1280 - }, - { - "completion_length": 45.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.5, - "completions/max_terminated_length": 45.5, - "completions/mean_length": 42.35, - "completions/mean_terminated_length": 42.35, - "completions/min_length": 39.2, - "completions/min_terminated_length": 39.2, - "epoch": 0.007819034803796778, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.5517230717465281, - "learning_rate": 1.6902064545046271e-06, - "loss": 0.0, - "num_tokens": 1698593.0, - "reward": 3.7299851179122925, - "reward_std": 0.07923897914588451, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2599851399660111, - "rewards/quality_reward_func/std": 0.07923896797001362, - "step": 1290 - }, - { - "completion_length": 43.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.0, - "completions/max_terminated_length": 43.0, - "completions/mean_length": 38.7, - "completions/mean_terminated_length": 38.7, - "completions/min_length": 35.8, - "completions/min_terminated_length": 35.8, - "epoch": 0.007879647476694427, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.3437905013561249, - "kl": 0.37127360627055167, - "learning_rate": 1.6490511244673752e-06, - "loss": 0.0, - "num_tokens": 1710561.0, - "reward": 4.005517053604126, - "reward_std": 0.0736208476126194, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.505517065525055, - "rewards/quality_reward_func/std": 0.07362081781029702, - "step": 1300 - }, - { - "completion_length": 44.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.4, - "completions/max_terminated_length": 44.4, - "completions/mean_length": 40.475, - "completions/mean_terminated_length": 40.475, - "completions/min_length": 36.3, - "completions/min_terminated_length": 36.3, - "epoch": 0.007940260149592076, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.23188935220241547, - "kl": 0.5163222604052862, - "learning_rate": 1.6081550017009368e-06, - "loss": 0.0, - "num_tokens": 1720332.0, - "reward": 3.5359374046325684, - "reward_std": 0.10876650847494602, - "rewards/coherence_reward_func/mean": 0.9774999976158142, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.4625, - "rewards/formatting_reward_func/std": 0.025, - "rewards/quality_reward_func/mean": 1.0959374070167542, - "rewards/quality_reward_func/std": 0.06876652725040913, - "step": 1310 - }, - { - "completion_length": 41.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.7, - "completions/max_terminated_length": 41.7, - "completions/mean_length": 39.65, - "completions/mean_terminated_length": 39.65, - "completions/min_length": 36.5, - "completions/min_terminated_length": 36.5, - "epoch": 0.008000872822489727, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.16666565835475922, - "kl": 0.5675099290907383, - "learning_rate": 1.5675305435604776e-06, - "loss": 0.0, - "num_tokens": 1731822.0, - "reward": 3.976525831222534, - "reward_std": 0.049343513371422884, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.506525808572769, - "rewards/quality_reward_func/std": 0.049343547155149284, - "step": 1320 - }, - { - "completion_length": 38.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.8, - "completions/max_terminated_length": 38.8, - "completions/mean_length": 36.725, - "completions/mean_terminated_length": 36.725, - "completions/min_length": 34.7, - "completions/min_terminated_length": 34.7, - "epoch": 0.008061485495387375, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.22186774015426636, - "kl": 0.7802146021276712, - "learning_rate": 1.5271901246494847e-06, - "loss": 0.0001, - "num_tokens": 1746451.0, - "reward": 3.839250016212463, - "reward_std": 0.13178104758262635, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.369250077009201, - "rewards/quality_reward_func/std": 0.13178106918931007, - "step": 1330 - }, - { - "completion_length": 40.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.0, - "completions/max_terminated_length": 40.0, - "completions/mean_length": 37.325, - "completions/mean_terminated_length": 37.325, - "completions/min_length": 34.1, - "completions/min_terminated_length": 34.1, - "epoch": 0.008122098168285024, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.21763364970684052, - "kl": 0.5526682570576668, - "learning_rate": 1.487146033050344e-06, - "loss": 0.0, - "num_tokens": 1760508.0, - "reward": 4.063971328735351, - "reward_std": 0.14593251869082452, - "rewards/coherence_reward_func/mean": 0.9400000005960465, - "rewards/coherence_reward_func/std": 0.019999998807907104, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6239712476730346, - "rewards/quality_reward_func/std": 0.1288875199854374, - "step": 1340 - }, - { - "completion_length": 35.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.9, - "completions/max_terminated_length": 35.9, - "completions/mean_length": 33.575, - "completions/mean_terminated_length": 33.575, - "completions/min_length": 32.1, - "completions/min_terminated_length": 32.1, - "epoch": 0.008182710841182675, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.567939514759928, - "learning_rate": 1.4474104665812727e-06, - "loss": 0.0, - "num_tokens": 1773383.0, - "reward": 3.6367183208465574, - "reward_std": 0.1654182402882725, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1367182701826095, - "rewards/quality_reward_func/std": 0.1654182408004999, - "step": 1350 - }, - { - "completion_length": 44.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.8, - "completions/max_terminated_length": 44.8, - "completions/mean_length": 41.7, - "completions/mean_terminated_length": 41.7, - "completions/min_length": 38.2, - "completions/min_terminated_length": 38.2, - "epoch": 0.008243323514080324, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.37540047857910397, - "learning_rate": 1.4079955290807452e-06, - "loss": 0.0, - "num_tokens": 1784331.0, - "reward": 4.0010014295578005, - "reward_std": 0.08053193427622318, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5010013580322266, - "rewards/quality_reward_func/std": 0.08053191676735878, - "step": 1360 - }, - { - "completion_length": 38.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.0, - "completions/max_terminated_length": 38.0, - "completions/mean_length": 36.05, - "completions/mean_terminated_length": 36.05, - "completions/min_length": 33.9, - "completions/min_terminated_length": 33.9, - "epoch": 0.008303936186977973, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.1333545744419098, - "kl": 0.6424828507006168, - "learning_rate": 1.3689132267205432e-06, - "loss": 0.0, - "num_tokens": 1796685.0, - "reward": 3.8992115020751954, - "reward_std": 0.07596561014652252, - "rewards/coherence_reward_func/mean": 0.9099999964237213, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4892114460468293, - "rewards/quality_reward_func/std": 0.07596560940146446, - "step": 1370 - }, - { - "completion_length": 39.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.3, - "completions/max_terminated_length": 39.3, - "completions/mean_length": 35.325, - "completions/mean_terminated_length": 35.325, - "completions/min_length": 32.8, - "completions/min_terminated_length": 32.8, - "epoch": 0.008364548859875623, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.4210354685783386, - "kl": 0.562444256618619, - "learning_rate": 1.3301754643485671e-06, - "loss": 0.0, - "num_tokens": 1811034.0, - "reward": 4.093151378631592, - "reward_std": 0.07408938612788915, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.608151376247406, - "rewards/quality_reward_func/std": 0.05676891524344683, - "step": 1380 - }, - { - "completion_length": 49.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 49.2, - "completions/max_terminated_length": 49.2, - "completions/mean_length": 45.775, - "completions/mean_terminated_length": 45.775, - "completions/min_length": 41.1, - "completions/min_terminated_length": 41.1, - "epoch": 0.008425161532773272, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.22556762397289276, - "kl": 0.5780497211962938, - "learning_rate": 1.2917940418624942e-06, - "loss": 0.0, - "num_tokens": 1824285.0, - "reward": 3.858914279937744, - "reward_std": 0.08644282910972834, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3589142441749573, - "rewards/quality_reward_func/std": 0.08644279632717371, - "step": 1390 - }, - { - "completion_length": 32.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.6, - "completions/max_terminated_length": 32.6, - "completions/mean_length": 30.7, - "completions/mean_terminated_length": 30.7, - "completions/min_length": 28.9, - "completions/min_terminated_length": 28.9, - "epoch": 0.008485774205670921, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.22448307275772095, - "kl": 0.6989169731736183, - "learning_rate": 1.2537806506154246e-06, - "loss": 0.0, - "num_tokens": 1836749.0, - "reward": 3.9196971893310546, - "reward_std": 0.06787999146617948, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4196970999240874, - "rewards/quality_reward_func/std": 0.06787999051157385, - "step": 1400 - }, - { - "completion_length": 31.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 31.2, - "completions/max_terminated_length": 31.2, - "completions/mean_length": 29.25, - "completions/mean_terminated_length": 29.25, - "completions/min_length": 27.9, - "completions/min_terminated_length": 27.9, - "epoch": 0.008546386878568572, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.1975327730178833, - "kl": 0.5352459710091353, - "learning_rate": 1.2161468698545755e-06, - "loss": 0.0, - "num_tokens": 1846123.0, - "reward": 3.9435173749923704, - "reward_std": 0.08846498169004917, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4435174494981766, - "rewards/quality_reward_func/std": 0.08846500292420387, - "step": 1410 - }, - { - "completion_length": 45.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.4, - "completions/max_terminated_length": 45.4, - "completions/mean_length": 43.1, - "completions/mean_terminated_length": 43.1, - "completions/min_length": 40.2, - "completions/min_terminated_length": 40.2, - "epoch": 0.00860699955146622, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.5355475686490536, - "learning_rate": 1.1789041631941326e-06, - "loss": 0.0, - "num_tokens": 1862027.0, - "reward": 3.8648277044296266, - "reward_std": 0.0594564669765532, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3948277711868287, - "rewards/quality_reward_func/std": 0.059456495381891725, - "step": 1420 - }, - { - "completion_length": 37.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.5, - "completions/max_terminated_length": 37.5, - "completions/mean_length": 35.1, - "completions/mean_terminated_length": 35.1, - "completions/min_length": 33.0, - "completions/min_terminated_length": 33.0, - "epoch": 0.00866761222436387, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.24863724410533905, - "kl": 0.8889136493206025, - "learning_rate": 1.142063875123323e-06, - "loss": 0.0001, - "num_tokens": 1876483.0, - "reward": 3.8106926679611206, - "reward_std": 0.2503866729326546, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3106926739215852, - "rewards/quality_reward_func/std": 0.25038664806634187, - "step": 1430 - }, - { - "completion_length": 39.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.1, - "completions/max_terminated_length": 39.1, - "completions/mean_length": 36.2, - "completions/mean_terminated_length": 36.2, - "completions/min_length": 33.7, - "completions/min_terminated_length": 33.7, - "epoch": 0.00872822489726152, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.6508466430008412, - "learning_rate": 1.1056372275507748e-06, - "loss": 0.0, - "num_tokens": 1892155.0, - "reward": 3.9768813371658327, - "reward_std": 0.14665895849466323, - "rewards/coherence_reward_func/mean": 0.9374999940395355, - "rewards/coherence_reward_func/std": 0.03500000238418579, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5393813669681549, - "rewards/quality_reward_func/std": 0.14688132256269454, - "step": 1440 - }, - { - "completion_length": 43.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.9, - "completions/max_terminated_length": 43.9, - "completions/mean_length": 41.0, - "completions/mean_terminated_length": 41.0, - "completions/min_length": 38.0, - "completions/min_terminated_length": 38.0, - "epoch": 0.008788837570159169, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.13631002604961395, - "kl": 0.5113062348216773, - "learning_rate": 1.06963531638621e-06, - "loss": 0.0, - "num_tokens": 1905419.0, - "reward": 3.913432550430298, - "reward_std": 0.052389385527931154, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.413432464003563, - "rewards/quality_reward_func/std": 0.05238937532994896, - "step": 1450 - }, - { - "completion_length": 39.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.5, - "completions/max_terminated_length": 39.5, - "completions/mean_length": 37.95, - "completions/mean_terminated_length": 37.95, - "completions/min_length": 36.8, - "completions/min_terminated_length": 36.8, - "epoch": 0.008849450243056818, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.15441648662090302, - "kl": 0.7064547918736934, - "learning_rate": 1.0340691081605267e-06, - "loss": 0.0, - "num_tokens": 1918485.0, - "reward": 4.05954875946045, - "reward_std": 0.0508176582865417, - "rewards/coherence_reward_func/mean": 0.9399999976158142, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6195487260818482, - "rewards/quality_reward_func/std": 0.050817657727748156, - "step": 1460 - }, - { - "completion_length": 38.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.9, - "completions/max_terminated_length": 38.9, - "completions/mean_length": 37.075, - "completions/mean_terminated_length": 37.075, - "completions/min_length": 34.7, - "completions/min_terminated_length": 34.7, - "epoch": 0.008910062915954468, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.6027115270495415, - "learning_rate": 9.989494366852904e-07, - "loss": 0.0, - "num_tokens": 1931300.0, - "reward": 3.90128378868103, - "reward_std": 0.06042906288057566, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.431283837556839, - "rewards/quality_reward_func/std": 0.060429084859788415, - "step": 1470 - }, - { - "completion_length": 36.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.1, - "completions/max_terminated_length": 36.1, - "completions/mean_length": 34.025, - "completions/mean_terminated_length": 34.025, - "completions/min_length": 32.2, - "completions/min_terminated_length": 32.2, - "epoch": 0.008970675588852117, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.3705846667289734, - "kl": 0.7139816895127297, - "learning_rate": 9.64286999752642e-07, - "loss": 0.0, - "num_tokens": 1943813.0, - "reward": 3.6943301439285277, - "reward_std": 0.033519055764190855, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1943301856517792, - "rewards/quality_reward_func/std": 0.03351905785966665, - "step": 1480 - }, - { - "completion_length": 44.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.9, - "completions/max_terminated_length": 44.9, - "completions/mean_length": 41.8, - "completions/mean_terminated_length": 41.8, - "completions/min_length": 38.6, - "completions/min_terminated_length": 38.6, - "epoch": 0.009031288261749766, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.27768024802207947, - "kl": 0.4181586863473058, - "learning_rate": 9.300923558766556e-07, - "loss": 0.0, - "num_tokens": 1959013.0, - "reward": 3.974554419517517, - "reward_std": 0.06308290679007769, - "rewards/coherence_reward_func/mean": 0.9600000023841858, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5145544350147246, - "rewards/quality_reward_func/std": 0.06308291982859374, - "step": 1490 - }, - { - "completion_length": 41.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.9, - "completions/max_terminated_length": 41.9, - "completions/mean_length": 38.725, - "completions/mean_terminated_length": 38.725, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, - "epoch": 0.009091900934647417, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.6932669762521982, - "learning_rate": 8.963759210771053e-07, - "loss": 0.0, - "num_tokens": 1974274.0, - "reward": 3.685367155075073, - "reward_std": 0.14445191919803618, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.215367192029953, - "rewards/quality_reward_func/std": 0.14445189237594605, - "step": 1500 - }, - { - "completion_length": 39.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.5, - "completions/max_terminated_length": 39.5, - "completions/mean_length": 36.95, - "completions/mean_terminated_length": 36.95, - "completions/min_length": 34.5, - "completions/min_terminated_length": 34.5, - "epoch": 0.009152513607545066, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.6094869062304497, - "learning_rate": 8.631479657066508e-07, - "loss": 0.0, - "num_tokens": 1987620.0, - "reward": 3.6833661794662476, - "reward_std": 0.05165445755701512, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.213366150856018, - "rewards/quality_reward_func/std": 0.0516545019345358, - "step": 1510 - }, - { - "completion_length": 39.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.0, - "completions/max_terminated_length": 39.0, - "completions/mean_length": 36.525, - "completions/mean_terminated_length": 36.525, - "completions/min_length": 34.4, - "completions/min_terminated_length": 34.4, - "epoch": 0.009213126280442714, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.23846538364887238, - "kl": 0.6182480398565531, - "learning_rate": 8.304186113223839e-07, - "loss": 0.0, - "num_tokens": 1998317.0, - "reward": 3.836996817588806, - "reward_std": 0.14115408007055522, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3669968128204346, - "rewards/quality_reward_func/std": 0.14115412402898073, - "step": 1520 - }, - { - "completion_length": 37.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.4, - "completions/max_terminated_length": 37.4, - "completions/mean_length": 35.3, - "completions/mean_terminated_length": 35.3, - "completions/min_length": 33.8, - "completions/min_terminated_length": 33.8, - "epoch": 0.009273738953340365, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.14137135446071625, - "kl": 0.4990082811564207, - "learning_rate": 7.981978276027055e-07, - "loss": 0.0, - "num_tokens": 2008041.0, - "reward": 3.8615764141082765, - "reward_std": 0.04395291493274271, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3615763664245606, - "rewards/quality_reward_func/std": 0.04395288261584938, - "step": 1530 - }, - { - "completion_length": 34.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.1, - "completions/max_terminated_length": 34.1, - "completions/mean_length": 31.825, - "completions/mean_terminated_length": 31.825, - "completions/min_length": 30.1, - "completions/min_terminated_length": 30.1, - "epoch": 0.009334351626238014, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.2518616020679474, - "kl": 0.7289800090715289, - "learning_rate": 7.664954293104674e-07, - "loss": 0.0, - "num_tokens": 2019978.0, - "reward": 3.956069016456604, - "reward_std": 0.03467703033238649, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.456069028377533, - "rewards/quality_reward_func/std": 0.03467705752700567, - "step": 1540 - }, - { - "completion_length": 43.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.1, - "completions/max_terminated_length": 43.1, - "completions/mean_length": 39.525, - "completions/mean_terminated_length": 39.525, - "completions/min_length": 36.4, - "completions/min_terminated_length": 36.4, - "epoch": 0.009394964299135663, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.5346631288528443, - "learning_rate": 7.353210733032976e-07, - "loss": 0.0, - "num_tokens": 2033019.0, - "reward": 3.802832555770874, - "reward_std": 0.10494014797732235, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3028325080871581, - "rewards/quality_reward_func/std": 0.10494015598669648, - "step": 1550 - }, - { - "completion_length": 37.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.0, - "completions/max_terminated_length": 37.0, - "completions/mean_length": 33.625, - "completions/mean_terminated_length": 33.625, - "completions/min_length": 30.1, - "completions/min_terminated_length": 30.1, - "epoch": 0.009455576972033313, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.14377595484256744, - "kl": 0.7199880983680487, - "learning_rate": 7.046842555920283e-07, - "loss": 0.0, - "num_tokens": 2045096.0, - "reward": 3.6447502851486204, - "reward_std": 0.07920413194224238, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1447503209114074, - "rewards/quality_reward_func/std": 0.0792041715234518, - "step": 1560 - }, - { - "completion_length": 39.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.4, - "completions/max_terminated_length": 39.4, - "completions/mean_length": 36.35, - "completions/mean_terminated_length": 36.35, - "completions/min_length": 34.3, - "completions/min_terminated_length": 34.3, - "epoch": 0.009516189644930962, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.6046067669987678, - "learning_rate": 6.74594308448119e-07, - "loss": 0.0, - "num_tokens": 2060590.0, - "reward": 3.957841491699219, - "reward_std": 0.08803778495639562, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4878414213657378, - "rewards/quality_reward_func/std": 0.08803779818117619, - "step": 1570 - }, - { - "completion_length": 43.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.3, - "completions/max_terminated_length": 43.3, - "completions/mean_length": 39.55, - "completions/mean_terminated_length": 39.55, - "completions/min_length": 36.3, - "completions/min_terminated_length": 36.3, - "epoch": 0.009576802317828611, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.14253489673137665, - "kl": 0.5395171521231532, - "learning_rate": 6.450603975609593e-07, - "loss": 0.0, - "num_tokens": 2073168.0, - "reward": 3.7219276428222656, - "reward_std": 0.1262262837495655, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2519275963306427, - "rewards/quality_reward_func/std": 0.1262262668926269, - "step": 1580 - }, - { - "completion_length": 41.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.3, - "completions/max_terminated_length": 41.3, - "completions/mean_length": 37.6, - "completions/mean_terminated_length": 37.6, - "completions/min_length": 34.6, - "completions/min_terminated_length": 34.6, - "epoch": 0.009637414990726262, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.31045860052108765, - "kl": 0.5697082489728927, - "learning_rate": 6.160915192459058e-07, - "loss": 0.0, - "num_tokens": 2084252.0, - "reward": 3.898610806465149, - "reward_std": 0.08470682427287102, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3986108422279357, - "rewards/quality_reward_func/std": 0.08470681160688401, - "step": 1590 - }, - { - "completion_length": 33.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 33.9, - "completions/max_terminated_length": 33.9, - "completions/mean_length": 31.875, - "completions/mean_terminated_length": 31.875, - "completions/min_length": 30.0, - "completions/min_terminated_length": 30.0, - "epoch": 0.00969802766362391, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.616860269382596, - "learning_rate": 5.876964977039207e-07, - "loss": 0.0, - "num_tokens": 2097939.0, - "reward": 3.850436878204346, - "reward_std": 0.09563049608841538, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3504369378089904, - "rewards/quality_reward_func/std": 0.09563046777620912, - "step": 1600 - }, - { - "completion_length": 35.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.4, - "completions/max_terminated_length": 35.4, - "completions/mean_length": 34.25, - "completions/mean_terminated_length": 34.25, - "completions/min_length": 32.9, - "completions/min_terminated_length": 32.9, - "epoch": 0.00975864033652156, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.7023968700319528, - "learning_rate": 5.598839823336349e-07, - "loss": 0.0, - "num_tokens": 2110453.0, - "reward": 3.7349292755126955, - "reward_std": 0.11707438752055169, - "rewards/coherence_reward_func/mean": 0.95, - "rewards/coherence_reward_func/std": 0.02309400886297226, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2849293231964112, - "rewards/quality_reward_func/std": 0.10650582872331142, - "step": 1610 - }, - { - "completion_length": 46.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.8, - "completions/max_terminated_length": 46.8, - "completions/mean_length": 43.05, - "completions/mean_terminated_length": 43.05, - "completions/min_length": 39.2, - "completions/min_terminated_length": 39.2, - "epoch": 0.00981925300941921, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.22065599262714386, - "kl": 0.5216379288583994, - "learning_rate": 5.32662445096657e-07, - "loss": 0.0, - "num_tokens": 2122999.0, - "reward": 3.765179705619812, - "reward_std": 0.0852081986842677, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2951796174049377, - "rewards/quality_reward_func/std": 0.08520822711288929, - "step": 1620 - }, - { - "completion_length": 36.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.7, - "completions/max_terminated_length": 36.7, - "completions/mean_length": 34.775, - "completions/mean_terminated_length": 34.775, - "completions/min_length": 31.9, - "completions/min_terminated_length": 31.9, - "epoch": 0.009879865682316859, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.344510018825531, - "kl": 0.7621192060410976, - "learning_rate": 5.060401779369292e-07, - "loss": 0.0, - "num_tokens": 2135490.0, - "reward": 3.628094935417175, - "reward_std": 0.08223413676023483, - "rewards/coherence_reward_func/mean": 0.9700000047683716, - "rewards/coherence_reward_func/std": 0.020000000298023225, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1580949008464814, - "rewards/quality_reward_func/std": 0.10099657773971557, - "step": 1630 - }, - { - "completion_length": 39.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.9, - "completions/max_terminated_length": 39.9, - "completions/mean_length": 37.175, - "completions/mean_terminated_length": 37.175, - "completions/min_length": 34.1, - "completions/min_terminated_length": 34.1, - "epoch": 0.009940478355214508, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.18226611614227295, - "kl": 0.574143086373806, - "learning_rate": 4.800252902549243e-07, - "loss": 0.0, - "num_tokens": 2146681.0, - "reward": 3.851472806930542, - "reward_std": 0.12666394859552382, - "rewards/coherence_reward_func/mean": 0.9024999976158142, - "rewards/coherence_reward_func/std": 0.0723205104470253, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4489728450775146, - "rewards/quality_reward_func/std": 0.07018067184835672, - "step": 1640 - }, - { - "completion_length": 34.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.6, - "completions/max_terminated_length": 34.6, - "completions/mean_length": 30.7, - "completions/mean_terminated_length": 30.7, - "completions/min_length": 28.0, - "completions/min_terminated_length": 28.0, - "epoch": 0.010001091028112157, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.3307839035987854, - "kl": 0.6738088440150023, - "learning_rate": 4.54625706437441e-07, - "loss": 0.0, - "num_tokens": 2160213.0, - "reward": 3.631506657600403, - "reward_std": 0.06585814524441957, - "rewards/coherence_reward_func/mean": 0.9600000023841858, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1715066727250814, - "rewards/quality_reward_func/std": 0.0658581605181098, - "step": 1650 - }, - { - "completion_length": 44.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.7, - "completions/max_terminated_length": 44.7, - "completions/mean_length": 39.075, - "completions/mean_terminated_length": 39.075, - "completions/min_length": 35.2, - "completions/min_terminated_length": 35.2, - "epoch": 0.010061703701009807, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.6326223069801926, - "learning_rate": 4.2984916344376404e-07, - "loss": 0.0, - "num_tokens": 2172604.0, - "reward": 3.765461468696594, - "reward_std": 0.09697454781271517, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2954614877700805, - "rewards/quality_reward_func/std": 0.09697456057183444, - "step": 1660 - }, - { - "completion_length": 34.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.9, - "completions/max_terminated_length": 34.9, - "completions/mean_length": 32.075, - "completions/mean_terminated_length": 32.075, - "completions/min_length": 29.1, - "completions/min_terminated_length": 29.1, - "epoch": 0.010122316373907456, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.37101298570632935, - "kl": 0.8005252219736576, - "learning_rate": 4.057032084489032e-07, - "loss": 0.0001, - "num_tokens": 2184907.0, - "reward": 3.829305052757263, - "reward_std": 0.10471167676150799, - "rewards/coherence_reward_func/mean": 0.9300000011920929, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.399305048584938, - "rewards/quality_reward_func/std": 0.104711680021137, - "step": 1670 - }, - { - "completion_length": 29.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 29.2, - "completions/max_terminated_length": 29.2, - "completions/mean_length": 28.625, - "completions/mean_terminated_length": 28.625, - "completions/min_length": 27.8, - "completions/min_terminated_length": 27.8, - "epoch": 0.010182929046805105, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.843848580494523, - "learning_rate": 3.821951965446577e-07, - "loss": 0.0, - "num_tokens": 2196228.0, - "reward": 4.054256296157837, - "reward_std": 0.07670944058336318, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5842562794685364, - "rewards/quality_reward_func/std": 0.07670940482057631, - "step": 1680 - }, - { - "completion_length": 34.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.6, - "completions/max_terminated_length": 34.6, - "completions/mean_length": 33.175, - "completions/mean_terminated_length": 33.175, - "completions/min_length": 31.7, - "completions/min_terminated_length": 31.7, - "epoch": 0.010243541719702756, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.7569024676457048, - "learning_rate": 3.5933228849917956e-07, - "loss": 0.0, - "num_tokens": 2210123.0, - "reward": 3.5791541814804075, - "reward_std": 0.08816965823061765, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.0791541069746018, - "rewards/quality_reward_func/std": 0.08816969217732548, - "step": 1690 - }, - { - "completion_length": 31.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 31.0, - "completions/max_terminated_length": 31.0, - "completions/mean_length": 29.6, - "completions/mean_terminated_length": 29.6, - "completions/min_length": 28.3, - "completions/min_terminated_length": 28.3, - "epoch": 0.010304154392600405, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.8379929825663567, - "learning_rate": 3.371214485757393e-07, - "loss": 0.0, - "num_tokens": 2223643.0, - "reward": 3.87547652721405, - "reward_std": 0.08642592094838619, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.375476497411728, - "rewards/quality_reward_func/std": 0.08642595596611499, - "step": 1700 - }, - { - "completion_length": 36.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.4, - "completions/max_terminated_length": 36.4, - "completions/mean_length": 33.425, - "completions/mean_terminated_length": 33.425, - "completions/min_length": 31.3, - "completions/min_terminated_length": 31.3, - "epoch": 0.010364767065498054, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.6557522185146809, - "learning_rate": 3.1556944241133704e-07, - "loss": 0.0, - "num_tokens": 2235108.0, - "reward": 3.609204316139221, - "reward_std": 0.0684462774777785, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1092043042182922, - "rewards/quality_reward_func/std": 0.06844626909587533, - "step": 1710 - }, - { - "completion_length": 46.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.4, - "completions/max_terminated_length": 46.4, - "completions/mean_length": 42.05, - "completions/mean_terminated_length": 42.05, - "completions/min_length": 37.4, - "completions/min_terminated_length": 37.4, - "epoch": 0.010425379738395704, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.3153176009654999, - "kl": 0.6560196654871107, - "learning_rate": 2.946828349558309e-07, - "loss": 0.0, - "num_tokens": 2247578.0, - "reward": 3.829335641860962, - "reward_std": 0.13396261632442474, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3368355989456178, - "rewards/quality_reward_func/std": 0.12680764831602573, - "step": 1720 - }, - { - "completion_length": 37.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.7, - "completions/max_terminated_length": 37.7, - "completions/mean_length": 33.175, - "completions/mean_terminated_length": 33.175, - "completions/min_length": 29.8, - "completions/min_terminated_length": 29.8, - "epoch": 0.010485992411293353, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.7409754022955894, - "learning_rate": 2.7446798847218376e-07, - "loss": 0.0, - "num_tokens": 2258517.0, - "reward": 3.9572421073913575, - "reward_std": 0.12505322322249413, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4872421324253082, - "rewards/quality_reward_func/std": 0.12505321204662323, - "step": 1730 - }, - { - "completion_length": 30.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 30.6, - "completions/max_terminated_length": 30.6, - "completions/mean_length": 28.95, - "completions/mean_terminated_length": 28.95, - "completions/min_length": 27.0, - "completions/min_terminated_length": 27.0, - "epoch": 0.010546605084191002, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.0, - "kl": 1.0303468838334084, - "learning_rate": 2.549310605984612e-07, - "loss": 0.0001, - "num_tokens": 2271423.0, - "reward": 3.6396912336349487, - "reward_std": 0.11047207750380039, - "rewards/coherence_reward_func/mean": 0.9300000011920929, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.209691232442856, - "rewards/quality_reward_func/std": 0.11047208420932293, - "step": 1740 - }, - { - "completion_length": 31.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 31.5, - "completions/max_terminated_length": 31.5, - "completions/mean_length": 29.05, - "completions/mean_terminated_length": 29.05, - "completions/min_length": 27.1, - "completions/min_terminated_length": 27.1, - "epoch": 0.010607217757088653, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.26000910997390747, - "kl": 0.6540774557739496, - "learning_rate": 2.360780024721515e-07, - "loss": 0.0, - "num_tokens": 2282025.0, - "reward": 3.900636911392212, - "reward_std": 0.08529032468795776, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4006368696689606, - "rewards/quality_reward_func/std": 0.08529035337269306, - "step": 1750 - }, - { - "completion_length": 41.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.1, - "completions/max_terminated_length": 41.1, - "completions/mean_length": 35.1, - "completions/mean_terminated_length": 35.1, - "completions/min_length": 31.1, - "completions/min_terminated_length": 31.1, - "epoch": 0.010667830429986301, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.23310041427612305, - "kl": 0.58091857470572, - "learning_rate": 2.1791455691739323e-07, - "loss": 0.0, - "num_tokens": 2295441.0, - "reward": 3.782832479476929, - "reward_std": 0.11690248586237431, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3128324687480926, - "rewards/quality_reward_func/std": 0.1169025051407516, - "step": 1760 - }, - { - "completion_length": 32.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.5, - "completions/max_terminated_length": 32.5, - "completions/mean_length": 30.175, - "completions/mean_terminated_length": 30.175, - "completions/min_length": 27.7, - "completions/min_terminated_length": 27.7, - "epoch": 0.01072844310288395, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.17846845090389252, - "kl": 0.6914813637733459, - "learning_rate": 2.0044625669565582e-07, - "loss": 0.0, - "num_tokens": 2309340.0, - "reward": 3.7306325674057006, - "reward_std": 0.030673326179385185, - "rewards/coherence_reward_func/mean": 0.9399999976158142, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2906325876712799, - "rewards/quality_reward_func/std": 0.030673368694260718, - "step": 1770 - }, - { - "completion_length": 32.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.6, - "completions/max_terminated_length": 32.6, - "completions/mean_length": 29.3, - "completions/mean_terminated_length": 29.3, - "completions/min_length": 26.2, - "completions/min_terminated_length": 26.2, - "epoch": 0.010789055775781601, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.20088432729244232, - "kl": 0.7156840384704992, - "learning_rate": 1.8367842282040692e-07, - "loss": 0.0, - "num_tokens": 2322048.0, - "reward": 4.005929350852966, - "reward_std": 0.0838471569120884, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5059293925762176, - "rewards/quality_reward_func/std": 0.08384712897241116, - "step": 1780 - }, - { - "completion_length": 32.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.0, - "completions/max_terminated_length": 32.0, - "completions/mean_length": 31.175, - "completions/mean_terminated_length": 31.175, - "completions/min_length": 30.4, - "completions/min_terminated_length": 30.4, - "epoch": 0.01084966844867925, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.27307385206222534, - "kl": 0.5283912677317858, - "learning_rate": 1.676161629362777e-07, - "loss": 0.0, - "num_tokens": 2333647.0, - "reward": 3.857843279838562, - "reward_std": 0.15851602412876672, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3578432589769363, - "rewards/quality_reward_func/std": 0.1585160527320113, - "step": 1790 - }, - { - "completion_length": 35.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.4, - "completions/max_terminated_length": 35.4, - "completions/mean_length": 32.4, - "completions/mean_terminated_length": 32.4, - "completions/min_length": 29.5, - "completions/min_terminated_length": 29.5, - "epoch": 0.010910281121576899, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.1520412713289261, - "kl": 0.7610108926892281, - "learning_rate": 1.5226436976322728e-07, - "loss": 0.0, - "num_tokens": 2344143.0, - "reward": 3.615111494064331, - "reward_std": 0.15508272759616376, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1151114344596862, - "rewards/quality_reward_func/std": 0.15508267749100924, - "step": 1800 - }, - { - "completion_length": 39.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.2, - "completions/max_terminated_length": 39.2, - "completions/mean_length": 36.375, - "completions/mean_terminated_length": 36.375, - "completions/min_length": 33.7, - "completions/min_terminated_length": 33.7, - "epoch": 0.01097089379447455, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.31795915961265564, - "kl": 0.5607297398149967, - "learning_rate": 1.3762771960617315e-07, - "loss": 0.0, - "num_tokens": 2353486.0, - "reward": 3.794522213935852, - "reward_std": 0.15094572885427623, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.294522190093994, - "rewards/quality_reward_func/std": 0.15094575015828013, - "step": 1810 - }, - { - "completion_length": 39.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.4, - "completions/max_terminated_length": 39.4, - "completions/mean_length": 35.175, - "completions/mean_terminated_length": 35.175, - "completions/min_length": 30.8, - "completions/min_terminated_length": 30.8, - "epoch": 0.011031506467372198, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.673710684850812, - "learning_rate": 1.237106709305408e-07, - "loss": 0.0, - "num_tokens": 2365581.0, - "reward": 3.7541088104248046, - "reward_std": 0.10031453329138458, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2841087639331819, - "rewards/quality_reward_func/std": 0.10031452160328627, - "step": 1820 - }, - { - "completion_length": 40.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.4, - "completions/max_terminated_length": 40.4, - "completions/mean_length": 38.425, - "completions/mean_terminated_length": 38.425, - "completions/min_length": 37.2, - "completions/min_terminated_length": 37.2, - "epoch": 0.011092119140269847, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.19342079758644104, - "kl": 0.6287215381860733, - "learning_rate": 1.105174630041747e-07, - "loss": 0.0, - "num_tokens": 2377294.0, - "reward": 3.912764286994934, - "reward_std": 0.0975728084333241, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.442764151096344, - "rewards/quality_reward_func/std": 0.09757281136699021, - "step": 1830 - }, - { - "completion_length": 43.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.5, - "completions/max_terminated_length": 43.5, - "completions/mean_length": 40.625, - "completions/mean_terminated_length": 40.625, - "completions/min_length": 38.5, - "completions/min_terminated_length": 38.5, - "epoch": 0.011152731813167498, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.17117376625537872, - "kl": 0.6285104263573885, - "learning_rate": 9.805211460601455e-08, - "loss": 0.0001, - "num_tokens": 2391787.0, - "reward": 3.9878787994384766, - "reward_std": 0.10013104230165482, - "rewards/coherence_reward_func/mean": 0.9550000011920929, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5328787684440612, - "rewards/quality_reward_func/std": 0.08310010395944119, - "step": 1840 - }, - { - "completion_length": 38.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.0, - "completions/max_terminated_length": 38.0, - "completions/mean_length": 35.05, - "completions/mean_terminated_length": 35.05, - "completions/min_length": 32.3, - "completions/min_terminated_length": 32.3, - "epoch": 0.011213344486065147, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.20593678951263428, - "kl": 0.6582082536071538, - "learning_rate": 8.631842280193759e-08, - "loss": 0.0001, - "num_tokens": 2403377.0, - "reward": 3.797930192947388, - "reward_std": 0.11260090973228216, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2979300945997239, - "rewards/quality_reward_func/std": 0.11260092593729495, - "step": 1850 - }, - { - "completion_length": 46.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.0, - "completions/max_terminated_length": 46.0, - "completions/mean_length": 39.6, - "completions/mean_terminated_length": 39.6, - "completions/min_length": 33.1, - "completions/min_terminated_length": 33.1, - "epoch": 0.011273957158962795, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.4108888804912567, - "kl": 0.4934998281300068, - "learning_rate": 7.531996178813311e-08, - "loss": 0.0, - "num_tokens": 2413069.0, - "reward": 3.732119011878967, - "reward_std": 0.2216266430914402, - "rewards/coherence_reward_func/mean": 0.9474999994039536, - "rewards/coherence_reward_func/std": 0.020615528523921966, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2846189737319946, - "rewards/quality_reward_func/std": 0.201740912348032, - "step": 1860 - }, - { - "completion_length": 41.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.6, - "completions/max_terminated_length": 41.6, - "completions/mean_length": 38.8, - "completions/mean_terminated_length": 38.8, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, - "epoch": 0.011334569831860446, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.13784286379814148, - "kl": 0.5053746487945319, - "learning_rate": 6.506008180237111e-08, - "loss": 0.0, - "num_tokens": 2421997.0, - "reward": 3.7463685512542724, - "reward_std": 0.1127361407270655, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.253868579864502, - "rewards/quality_reward_func/std": 0.0982740418985486, - "step": 1870 - }, - { - "completion_length": 33.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 33.9, - "completions/max_terminated_length": 33.9, - "completions/mean_length": 32.225, - "completions/mean_terminated_length": 32.225, - "completions/min_length": 30.5, - "completions/min_terminated_length": 30.5, - "epoch": 0.011395182504758095, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.13122650980949402, - "kl": 0.6184401633683592, - "learning_rate": 5.554190810348442e-08, - "loss": 0.0, - "num_tokens": 2434906.0, - "reward": 3.970405387878418, - "reward_std": 0.07737901676446199, - "rewards/coherence_reward_func/mean": 0.9774999976158142, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.492905354499817, - "rewards/quality_reward_func/std": 0.07734216339886188, - "step": 1880 - }, - { - "completion_length": 34.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.9, - "completions/max_terminated_length": 34.9, - "completions/mean_length": 32.325, - "completions/mean_terminated_length": 32.325, - "completions/min_length": 30.5, - "completions/min_terminated_length": 30.5, - "epoch": 0.011455795177655744, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.7600609693676234, - "learning_rate": 4.676834001938718e-08, - "loss": 0.0001, - "num_tokens": 2446527.0, - "reward": 3.784378004074097, - "reward_std": 0.07386668361723422, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.284377983212471, - "rewards/quality_reward_func/std": 0.07386667430400848, - "step": 1890 - }, - { - "completion_length": 44.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.7, - "completions/max_terminated_length": 44.7, - "completions/mean_length": 42.025, - "completions/mean_terminated_length": 42.025, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, - "epoch": 0.011516407850553394, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.19688816368579865, - "kl": 0.5642052394337952, - "learning_rate": 3.874205006390852e-08, - "loss": 0.0, - "num_tokens": 2460284.0, - "reward": 3.886904716491699, - "reward_std": 0.12768367086537183, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4169047951698304, - "rewards/quality_reward_func/std": 0.12768373372964562, - "step": 1900 - }, - { - "completion_length": 37.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.6, - "completions/max_terminated_length": 37.6, - "completions/mean_length": 34.3, - "completions/mean_terminated_length": 34.3, - "completions/min_length": 31.1, - "completions/min_terminated_length": 31.1, - "epoch": 0.011577020523451043, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.6913503933697939, - "learning_rate": 3.146548312272152e-08, - "loss": 0.0, - "num_tokens": 2472668.0, - "reward": 3.822887682914734, - "reward_std": 0.13587198304012418, - "rewards/coherence_reward_func/mean": 0.9574999988079071, - "rewards/coherence_reward_func/std": 0.05541452020406723, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3653876781463623, - "rewards/quality_reward_func/std": 0.09212267585098743, - "step": 1910 - }, - { - "completion_length": 37.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.7, - "completions/max_terminated_length": 37.7, - "completions/mean_length": 34.7, - "completions/mean_terminated_length": 34.7, - "completions/min_length": 31.5, - "completions/min_terminated_length": 31.5, - "epoch": 0.011637633196348692, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.14820994436740875, - "kl": 0.5970391035079956, - "learning_rate": 2.494085570860616e-08, - "loss": 0.0, - "num_tokens": 2485768.0, - "reward": 3.9615900039672853, - "reward_std": 0.10947830174118281, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4615899622440338, - "rewards/quality_reward_func/std": 0.10947828963398934, - "step": 1920 - }, - { - "completion_length": 37.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.0, - "completions/max_terminated_length": 37.0, - "completions/mean_length": 33.425, - "completions/mean_terminated_length": 33.425, - "completions/min_length": 29.7, - "completions/min_terminated_length": 29.7, - "epoch": 0.011698245869246343, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.36690980195999146, - "kl": 0.6565518420189619, - "learning_rate": 1.91701552862783e-08, - "loss": 0.0, - "num_tokens": 2501817.0, - "reward": 3.76883327960968, - "reward_std": 0.12354036383330821, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.268833178281784, - "rewards/quality_reward_func/std": 0.12354034576565028, - "step": 1930 - }, - { - "completion_length": 41.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.6, - "completions/max_terminated_length": 41.6, - "completions/mean_length": 39.525, - "completions/mean_terminated_length": 39.525, - "completions/min_length": 37.3, - "completions/min_terminated_length": 37.3, - "epoch": 0.011758858542143992, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.22181129455566406, - "kl": 0.6917136002331972, - "learning_rate": 1.4155139666988393e-08, - "loss": 0.0, - "num_tokens": 2515062.0, - "reward": 3.9052948474884035, - "reward_std": 0.1158434453420341, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4052947521209718, - "rewards/quality_reward_func/std": 0.11584346173331142, - "step": 1940 - }, - { - "completion_length": 35.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.8, - "completions/max_terminated_length": 35.8, - "completions/mean_length": 32.675, - "completions/mean_terminated_length": 32.675, - "completions/min_length": 30.2, - "completions/min_terminated_length": 30.2, - "epoch": 0.01181947121504164, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.43130362033843994, - "kl": 0.555920683965087, - "learning_rate": 9.897336473076168e-09, - "loss": 0.0, - "num_tokens": 2527225.0, - "reward": 3.914797306060791, - "reward_std": 0.12384454580023885, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4147973597049712, - "rewards/quality_reward_func/std": 0.1238445309922099, - "step": 1950 - }, - { - "completion_length": 34.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.7, - "completions/max_terminated_length": 34.7, - "completions/mean_length": 32.2, - "completions/mean_terminated_length": 32.2, - "completions/min_length": 30.0, - "completions/min_terminated_length": 30.0, - "epoch": 0.011880083887939291, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.7348119537346065, - "learning_rate": 6.398042672640104e-09, - "loss": 0.0, - "num_tokens": 2537581.0, - "reward": 3.8622239351272585, - "reward_std": 0.07123937433352694, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.362224006652832, - "rewards/quality_reward_func/std": 0.07123938724398612, - "step": 1960 - }, - { - "completion_length": 36.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.1, - "completions/max_terminated_length": 36.1, - "completions/mean_length": 34.6, - "completions/mean_terminated_length": 34.6, - "completions/min_length": 32.5, - "completions/min_terminated_length": 32.5, - "epoch": 0.01194069656083694, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.30925706028938293, - "kl": 0.8349323466420173, - "learning_rate": 3.6583241844706517e-09, - "loss": 0.0001, - "num_tokens": 2553969.0, - "reward": 3.897159743309021, - "reward_std": 0.07248044284060598, - "rewards/coherence_reward_func/mean": 0.9400000005960465, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4571597695350647, - "rewards/quality_reward_func/std": 0.07248042160645127, - "step": 1970 - }, - { - "completion_length": 39.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.5, - "completions/max_terminated_length": 39.5, - "completions/mean_length": 37.425, - "completions/mean_terminated_length": 37.425, - "completions/min_length": 35.9, - "completions/min_terminated_length": 35.9, - "epoch": 0.012001309233734589, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.5941752482205629, - "learning_rate": 1.6790155533594198e-09, - "loss": 0.0, - "num_tokens": 2567382.0, - "reward": 3.8760265350341796, - "reward_std": 0.07388950139284134, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.40602650642395, - "rewards/quality_reward_func/std": 0.07388949021697044, - "step": 1980 - }, - { - "completion_length": 32.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.6, - "completions/max_terminated_length": 32.6, - "completions/mean_length": 29.95, - "completions/mean_terminated_length": 29.95, - "completions/min_length": 28.2, - "completions/min_terminated_length": 28.2, - "epoch": 0.01206192190663224, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.5642943378537894, - "learning_rate": 4.6071969588945555e-10, - "loss": 0.0, - "num_tokens": 2578988.0, - "reward": 3.837187123298645, - "reward_std": 0.10498771369457245, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3371871441602707, - "rewards/quality_reward_func/std": 0.10498773269355297, - "step": 1990 - }, - { - "completion_length": 34.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.0, - "completions/max_terminated_length": 34.0, - "completions/mean_length": 31.025, - "completions/mean_terminated_length": 31.025, - "completions/min_length": 27.6, - "completions/min_terminated_length": 27.6, - "epoch": 0.012122534579529888, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.2630312740802765, - "kl": 0.6248931862413883, - "learning_rate": 3.807716780768189e-12, - "loss": 0.0, - "num_tokens": 2592601.0, - "reward": 3.54885847568512, - "reward_std": 0.1354080844670534, - "rewards/coherence_reward_func/mean": 0.9774999976158142, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.071358424425125, - "rewards/quality_reward_func/std": 0.12041439302265644, - "step": 2000 - }, - { - "completion_length": 38.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.1, - "completions/max_terminated_length": 38.1, - "completions/mean_length": 35.625, - "completions/mean_terminated_length": 35.625, - "completions/min_length": 33.3, - "completions/min_terminated_length": 33.3, - "epoch": 0.012183147252427537, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.527397221699357, - "learning_rate": 5.64846864475237e-07, - "loss": 0.0, - "num_tokens": 2606274.0, - "reward": 3.9422970056533813, - "reward_std": 0.08354081520810723, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.457297110557556, - "rewards/quality_reward_func/std": 0.0669589244760573, - "step": 2010 - }, - { - "completion_length": 30.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 30.3, - "completions/max_terminated_length": 30.3, - "completions/mean_length": 27.3, - "completions/mean_terminated_length": 27.3, - "completions/min_length": 24.8, - "completions/min_terminated_length": 24.8, - "epoch": 0.012243759925325186, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.26219385862350464, - "kl": 0.7765242636203766, - "learning_rate": 5.429364805754758e-07, - "loss": 0.0, - "num_tokens": 2619306.0, - "reward": 3.6878623008728026, - "reward_std": 0.1136812336742878, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.187862229347229, - "rewards/quality_reward_func/std": 0.11368124671280384, - "step": 2020 - }, - { - "completion_length": 32.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.4, - "completions/max_terminated_length": 32.4, - "completions/mean_length": 30.525, - "completions/mean_terminated_length": 30.525, - "completions/min_length": 28.6, - "completions/min_terminated_length": 28.6, - "epoch": 0.012304372598222837, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.2785656154155731, - "kl": 0.7394205266609788, - "learning_rate": 5.214076300865359e-07, - "loss": 0.0, - "num_tokens": 2630359.0, - "reward": 3.687705874443054, - "reward_std": 0.15183188505470752, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2177058696746825, - "rewards/quality_reward_func/std": 0.15183186950162053, - "step": 2030 - }, - { - "completion_length": 37.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.1, - "completions/max_terminated_length": 37.1, - "completions/mean_length": 33.25, - "completions/mean_terminated_length": 33.25, - "completions/min_length": 31.2, - "completions/min_terminated_length": 31.2, - "epoch": 0.012364985271120486, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.8439843498170376, - "learning_rate": 5.002645101004766e-07, - "loss": 0.0001, - "num_tokens": 2645513.0, - "reward": 3.825087642669678, - "reward_std": 0.06591251920908689, - "rewards/coherence_reward_func/mean": 0.9399999976158142, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.385087686777115, - "rewards/quality_reward_func/std": 0.06591251576319337, - "step": 2040 - }, - { - "completion_length": 43.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.3, - "completions/max_terminated_length": 43.3, - "completions/mean_length": 37.95, - "completions/mean_terminated_length": 37.95, - "completions/min_length": 34.7, - "completions/min_terminated_length": 34.7, - "epoch": 0.012425597944018135, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.0, - "kl": 0.6198632460087538, - "learning_rate": 4.795112425104323e-07, - "loss": 0.0001, - "num_tokens": 2657823.0, - "reward": 4.004547929763794, - "reward_std": 0.13050218414282427, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5120479702949523, - "rewards/quality_reward_func/std": 0.14299601692473515, - "step": 2050 - }, - { - "completion_length": 37.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.0, - "completions/max_terminated_length": 37.0, - "completions/mean_length": 34.425, - "completions/mean_terminated_length": 34.425, - "completions/min_length": 33.3, - "completions/min_terminated_length": 33.3, - "epoch": 0.012486210616915785, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.25881993770599365, - "kl": 0.7347626067698002, - "learning_rate": 4.591518732070402e-07, - "loss": 0.0, - "num_tokens": 2671572.0, - "reward": 3.8481714487075807, - "reward_std": 0.08668275643140078, - "rewards/coherence_reward_func/mean": 0.9400000005960465, - "rewards/coherence_reward_func/std": 0.019999998807907104, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4081713795661925, - "rewards/quality_reward_func/std": 0.0666827370179817, - "step": 2060 - }, - { - "completion_length": 33.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 33.1, - "completions/max_terminated_length": 33.1, - "completions/mean_length": 31.4, - "completions/mean_terminated_length": 31.4, - "completions/min_length": 29.6, - "completions/min_terminated_length": 29.6, - "epoch": 0.012546823289813434, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.15212668478488922, - "kl": 0.7098456278443337, - "learning_rate": 4.391903712896861e-07, - "loss": 0.0, - "num_tokens": 2684532.0, - "reward": 3.8694933652877808, - "reward_std": 0.1730513483285904, - "rewards/coherence_reward_func/mean": 0.9399999976158142, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4294934183359147, - "rewards/quality_reward_func/std": 0.17305134031921626, - "step": 2070 - }, - { - "completion_length": 38.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.1, - "completions/max_terminated_length": 38.1, - "completions/mean_length": 34.55, - "completions/mean_terminated_length": 34.55, - "completions/min_length": 31.6, - "completions/min_terminated_length": 31.6, - "epoch": 0.012607435962711083, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.23879791796207428, - "kl": 0.5162452284246684, - "learning_rate": 4.196306282927187e-07, - "loss": 0.0, - "num_tokens": 2698226.0, - "reward": 4.148700022697449, - "reward_std": 0.09946945682168007, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6636998653411865, - "rewards/quality_reward_func/std": 0.11183752566576004, - "step": 2080 - }, - { - "completion_length": 39.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.5, - "completions/max_terminated_length": 39.5, - "completions/mean_length": 38.3, - "completions/mean_terminated_length": 38.3, - "completions/min_length": 36.4, - "completions/min_terminated_length": 36.4, - "epoch": 0.012668048635608734, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.6332034353166819, - "learning_rate": 4.0047645742679275e-07, - "loss": 0.0, - "num_tokens": 2710274.0, - "reward": 3.7876162052154543, - "reward_std": 0.08663389394059777, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3176162585616111, - "rewards/quality_reward_func/std": 0.0866338367573917, - "step": 2090 - }, - { - "completion_length": 35.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.5, - "completions/max_terminated_length": 35.5, - "completions/mean_length": 33.975, - "completions/mean_terminated_length": 33.975, - "completions/min_length": 32.6, - "completions/min_terminated_length": 32.6, - "epoch": 0.012728661308506382, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.3389434814453125, - "kl": 0.6692267213016748, - "learning_rate": 3.817315928354695e-07, - "loss": 0.0, - "num_tokens": 2721353.0, - "reward": 4.179599142074585, - "reward_std": 0.03971860965248197, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6795991480350494, - "rewards/quality_reward_func/std": 0.03971858550794423, - "step": 2100 - }, - { - "completion_length": 42.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.4, - "completions/max_terminated_length": 42.4, - "completions/mean_length": 38.4, - "completions/mean_terminated_length": 38.4, - "completions/min_length": 35.7, - "completions/min_terminated_length": 35.7, - "epoch": 0.012789273981404031, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.3139670193195343, - "kl": 0.5426759473979473, - "learning_rate": 3.633996888672428e-07, - "loss": 0.0, - "num_tokens": 2731041.0, - "reward": 3.883969783782959, - "reward_std": 0.07852623909711838, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3839697897434236, - "rewards/quality_reward_func/std": 0.07852623444050551, - "step": 2110 - }, - { - "completion_length": 38.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.8, - "completions/max_terminated_length": 38.8, - "completions/mean_length": 37.425, - "completions/mean_terminated_length": 37.425, - "completions/min_length": 35.8, - "completions/min_terminated_length": 35.8, - "epoch": 0.012849886654301682, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.6792019728571177, - "learning_rate": 3.4548431936311275e-07, - "loss": 0.0, - "num_tokens": 2744682.0, - "reward": 3.9952262163162233, - "reward_std": 0.11697835708037019, - "rewards/coherence_reward_func/mean": 0.9899999976158143, - "rewards/coherence_reward_func/std": 0.020000000298023225, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5052262544631958, - "rewards/quality_reward_func/std": 0.0984926023054868, - "step": 2120 - }, - { - "completion_length": 39.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.9, - "completions/max_terminated_length": 39.9, - "completions/mean_length": 37.5, - "completions/mean_terminated_length": 37.5, - "completions/min_length": 34.8, - "completions/min_terminated_length": 34.8, - "epoch": 0.01291049932719933, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.32901260256767273, - "kl": 0.48247018456459045, - "learning_rate": 3.2798897695986155e-07, - "loss": 0.0, - "num_tokens": 2756026.0, - "reward": 3.995416355133057, - "reward_std": 0.10950108729302883, - "rewards/coherence_reward_func/mean": 0.9774999976158142, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.517916452884674, - "rewards/quality_reward_func/std": 0.10595933198928834, - "step": 2130 - }, - { - "completion_length": 39.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.0, - "completions/max_terminated_length": 39.0, - "completions/mean_length": 37.4, - "completions/mean_terminated_length": 37.4, - "completions/min_length": 35.9, - "completions/min_terminated_length": 35.9, - "epoch": 0.01297111200009698, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.20092740654945374, - "kl": 0.7654679004102946, - "learning_rate": 3.1091707240915704e-07, - "loss": 0.0, - "num_tokens": 2766542.0, - "reward": 3.806930947303772, - "reward_std": 0.11522182798944414, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3069309890270233, - "rewards/quality_reward_func/std": 0.11522185266949236, - "step": 2140 - }, - { - "completion_length": 44.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.0, - "completions/max_terminated_length": 44.0, - "completions/mean_length": 38.825, - "completions/mean_terminated_length": 38.825, - "completions/min_length": 33.4, - "completions/min_terminated_length": 33.4, - "epoch": 0.01303172467299463, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.41246721148490906, - "kl": 0.67378119379282, - "learning_rate": 2.942719339126171e-07, - "loss": 0.0001, - "num_tokens": 2779907.0, - "reward": 3.873100018501282, - "reward_std": 0.12143403701484204, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3730999886989594, - "rewards/quality_reward_func/std": 0.12143404334783554, - "step": 2150 - }, - { - "completion_length": 31.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 31.6, - "completions/max_terminated_length": 31.6, - "completions/mean_length": 29.4, - "completions/mean_terminated_length": 29.4, - "completions/min_length": 27.8, - "completions/min_terminated_length": 27.8, - "epoch": 0.013092337345892279, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.3698104918003082, - "kl": 0.7937666192650795, - "learning_rate": 2.780568064729716e-07, - "loss": 0.0, - "num_tokens": 2790899.0, - "reward": 3.905963635444641, - "reward_std": 0.09404368782415987, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4059635937213897, - "rewards/quality_reward_func/std": 0.09404368307441473, - "step": 2160 - }, - { - "completion_length": 45.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.2, - "completions/max_terminated_length": 45.2, - "completions/mean_length": 40.65, - "completions/mean_terminated_length": 40.65, - "completions/min_length": 36.8, - "completions/min_terminated_length": 36.8, - "epoch": 0.013152950018789928, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.15966033935546875, - "kl": 0.5114851795136929, - "learning_rate": 2.622748512614437e-07, - "loss": 0.0, - "num_tokens": 2799669.0, - "reward": 3.864537310600281, - "reward_std": 0.09009792669676245, - "rewards/coherence_reward_func/mean": 0.9, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4645373225212097, - "rewards/quality_reward_func/std": 0.09009791477583348, - "step": 2170 - }, - { - "completion_length": 32.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.1, - "completions/max_terminated_length": 32.1, - "completions/mean_length": 31.125, - "completions/mean_terminated_length": 31.125, - "completions/min_length": 30.2, - "completions/min_terminated_length": 30.2, - "epoch": 0.013213562691687579, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.7952270476147533, - "learning_rate": 2.4692914500147185e-07, - "loss": 0.0, - "num_tokens": 2813930.0, - "reward": 3.750529146194458, - "reward_std": 0.028828978561796247, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.250529170036316, - "rewards/quality_reward_func/std": 0.028828999237157406, - "step": 2180 - }, - { - "completion_length": 32.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 32.3, - "completions/max_terminated_length": 32.3, - "completions/mean_length": 31.1, - "completions/mean_terminated_length": 31.1, - "completions/min_length": 30.4, - "completions/min_terminated_length": 30.4, - "epoch": 0.013274175364585227, - "frac_reward_zero_std": 0.6, - "grad_norm": 0.1689903289079666, - "kl": 0.7716862162575125, - "learning_rate": 2.320226793688979e-07, - "loss": 0.0, - "num_tokens": 2826210.0, - "reward": 3.8593007802963255, - "reward_std": 0.04759715981781483, - "rewards/coherence_reward_func/mean": 0.8500000059604644, - "rewards/coherence_reward_func/std": 0.019999998807907104, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5093008399009704, - "rewards/quality_reward_func/std": 0.05894124172627926, - "step": 2190 - }, - { - "completion_length": 31.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 31.7, - "completions/max_terminated_length": 31.7, - "completions/mean_length": 29.925, - "completions/mean_terminated_length": 29.925, - "completions/min_length": 28.3, - "completions/min_terminated_length": 28.3, - "epoch": 0.013334788037482876, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.34756502509117126, - "kl": 0.6283991295844317, - "learning_rate": 2.1755836040873197e-07, - "loss": 0.0, - "num_tokens": 2839715.0, - "reward": 3.5187587976455688, - "reward_std": 0.11687652822583913, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.0487587675452232, - "rewards/quality_reward_func/std": 0.1168765788897872, - "step": 2200 - }, - { - "completion_length": 30.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 30.2, - "completions/max_terminated_length": 30.2, - "completions/mean_length": 29.225, - "completions/mean_terminated_length": 29.225, - "completions/min_length": 28.6, - "completions/min_terminated_length": 28.6, - "epoch": 0.013395400710380527, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.0, - "kl": 0.7514891102910042, - "learning_rate": 2.0353900796861503e-07, - "loss": 0.0, - "num_tokens": 2855164.0, - "reward": 3.857961082458496, - "reward_std": 0.05938511043787002, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3879611253738404, - "rewards/quality_reward_func/std": 0.059385118260979655, - "step": 2210 - }, - { - "completion_length": 40.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.7, - "completions/max_terminated_length": 40.7, - "completions/mean_length": 36.425, - "completions/mean_terminated_length": 36.425, - "completions/min_length": 33.5, - "completions/min_terminated_length": 33.5, - "epoch": 0.013456013383278176, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.2538030743598938, - "kl": 0.5167060427367687, - "learning_rate": 1.8996735514908327e-07, - "loss": 0.0, - "num_tokens": 2865377.0, - "reward": 3.8268110036849974, - "reward_std": 0.1216883840970695, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3268110036849976, - "rewards/quality_reward_func/std": 0.12168835522606969, - "step": 2220 - }, - { - "completion_length": 41.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.6, - "completions/max_terminated_length": 41.6, - "completions/mean_length": 36.8, - "completions/mean_terminated_length": 36.8, - "completions/min_length": 30.3, - "completions/min_terminated_length": 30.3, - "epoch": 0.013516626056175825, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.48048161119222643, - "learning_rate": 1.7684604777074427e-07, - "loss": 0.0, - "num_tokens": 2876565.0, - "reward": 3.6783462047576903, - "reward_std": 0.11543877327349036, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.20834618806839, - "rewards/quality_reward_func/std": 0.11543877467047423, - "step": 2230 - }, - { - "completion_length": 35.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.8, - "completions/max_terminated_length": 35.8, - "completions/mean_length": 33.725, - "completions/mean_terminated_length": 33.725, - "completions/min_length": 31.9, - "completions/min_terminated_length": 31.9, - "epoch": 0.013577238729073475, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.23462697863578796, - "kl": 0.6923598662018776, - "learning_rate": 1.6417764385846996e-07, - "loss": 0.0, - "num_tokens": 2891306.0, - "reward": 3.821285128593445, - "reward_std": 0.05842281579971313, - "rewards/coherence_reward_func/mean": 0.9399999976158142, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3812851011753082, - "rewards/quality_reward_func/std": 0.05842278301715851, - "step": 2240 - }, - { - "completion_length": 37.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.8, - "completions/max_terminated_length": 37.8, - "completions/mean_length": 36.3, - "completions/mean_terminated_length": 36.3, - "completions/min_length": 34.4, - "completions/min_terminated_length": 34.4, - "epoch": 0.013637851401971124, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.2515592575073242, - "kl": 0.5378904946148395, - "learning_rate": 1.5196461314270438e-07, - "loss": 0.0, - "num_tokens": 2904270.0, - "reward": 3.744435358047485, - "reward_std": 0.09523950256407261, - "rewards/coherence_reward_func/mean": 0.9399999976158142, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3044353067874908, - "rewards/quality_reward_func/std": 0.09523947611451149, - "step": 2250 - }, - { - "completion_length": 46.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.0, - "completions/max_terminated_length": 46.0, - "completions/mean_length": 43.0, - "completions/mean_terminated_length": 43.0, - "completions/min_length": 40.6, - "completions/min_terminated_length": 40.6, - "epoch": 0.013698464074868773, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.24622289836406708, - "kl": 0.4540472894906998, - "learning_rate": 1.4020933657798385e-07, - "loss": 0.0, - "num_tokens": 2913530.0, - "reward": 3.8658218145370484, - "reward_std": 0.0776843567728065, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3658217906951904, - "rewards/quality_reward_func/std": 0.07768436941551045, - "step": 2260 - }, - { - "completion_length": 40.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.3, - "completions/max_terminated_length": 40.3, - "completions/mean_length": 37.35, - "completions/mean_terminated_length": 37.35, - "completions/min_length": 35.5, - "completions/min_terminated_length": 35.5, - "epoch": 0.013759076747766424, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.6461712121963501, - "kl": 0.7032709065824747, - "learning_rate": 1.2891410587876714e-07, - "loss": 0.0, - "num_tokens": 2927884.0, - "reward": 4.103630995750427, - "reward_std": 0.08582227125298232, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6036309719085693, - "rewards/quality_reward_func/std": 0.0858222215436399, - "step": 2270 - }, - { - "completion_length": 37.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.6, - "completions/max_terminated_length": 37.6, - "completions/mean_length": 35.4, - "completions/mean_terminated_length": 35.4, - "completions/min_length": 32.2, - "completions/min_terminated_length": 32.2, - "epoch": 0.013819689420664073, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.6915909007191658, - "learning_rate": 1.180811230726589e-07, - "loss": 0.0, - "num_tokens": 2940584.0, - "reward": 3.6822080612182617, - "reward_std": 0.07480372041463852, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1822080731391906, - "rewards/quality_reward_func/std": 0.07480371855199337, - "step": 2280 - }, - { - "completion_length": 49.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 49.3, - "completions/max_terminated_length": 49.3, - "completions/mean_length": 45.725, - "completions/mean_terminated_length": 45.725, - "completions/min_length": 41.8, - "completions/min_terminated_length": 41.8, - "epoch": 0.013880302093561721, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.5746395938098431, - "learning_rate": 1.0771250007112155e-07, - "loss": 0.0001, - "num_tokens": 2954605.0, - "reward": 3.910894179344177, - "reward_std": 0.1052317249123007, - "rewards/coherence_reward_func/mean": 0.9824999988079071, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4283941566944123, - "rewards/quality_reward_func/std": 0.07679192265495657, - "step": 2290 - }, - { - "completion_length": 36.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.8, - "completions/max_terminated_length": 36.8, - "completions/mean_length": 33.775, - "completions/mean_terminated_length": 33.775, - "completions/min_length": 31.9, - "completions/min_terminated_length": 31.9, - "epoch": 0.013940914766459372, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.2086838185787201, - "kl": 0.6577423378825188, - "learning_rate": 9.781025825775392e-08, - "loss": 0.0, - "num_tokens": 2966524.0, - "reward": 3.8150172233581543, - "reward_std": 0.10937830447219313, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3150171637535095, - "rewards/quality_reward_func/std": 0.10937829324975609, - "step": 2300 - }, - { - "completion_length": 45.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.5, - "completions/max_terminated_length": 45.5, - "completions/mean_length": 42.975, - "completions/mean_terminated_length": 42.975, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, - "epoch": 0.014001527439357021, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.4614509642124176, - "kl": 0.6215152129530906, - "learning_rate": 8.837632809421681e-08, - "loss": 0.0001, - "num_tokens": 2982403.0, - "reward": 3.8810100078582765, - "reward_std": 0.12997806966304778, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.381009966135025, - "rewards/quality_reward_func/std": 0.1299780823290348, - "step": 2310 - }, - { - "completion_length": 41.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.1, - "completions/max_terminated_length": 41.1, - "completions/mean_length": 38.075, - "completions/mean_terminated_length": 38.075, - "completions/min_length": 34.2, - "completions/min_terminated_length": 34.2, - "epoch": 0.01406214011225467, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.5260451436042786, - "kl": 0.6803686216473579, - "learning_rate": 7.941254874388904e-08, - "loss": 0.0001, - "num_tokens": 2998814.0, - "reward": 4.007596254348755, - "reward_std": 0.11801522299647331, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5075962781906127, - "rewards/quality_reward_func/std": 0.11801521815359592, - "step": 2320 - }, - { - "completion_length": 45.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.2, - "completions/max_terminated_length": 45.2, - "completions/mean_length": 40.4, - "completions/mean_terminated_length": 40.4, - "completions/min_length": 35.8, - "completions/min_terminated_length": 35.8, - "epoch": 0.01412275278515232, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.3105284869670868, - "kl": 0.5939857248216868, - "learning_rate": 7.092066771331507e-08, - "loss": 0.0, - "num_tokens": 3009442.0, - "reward": 3.7938735485076904, - "reward_std": 0.15046655498445033, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2938735604286193, - "rewards/quality_reward_func/std": 0.15046654269099236, - "step": 2330 - }, - { - "completion_length": 42.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.6, - "completions/max_terminated_length": 42.6, - "completions/mean_length": 39.175, - "completions/mean_terminated_length": 39.175, - "completions/min_length": 35.8, - "completions/min_terminated_length": 35.8, - "epoch": 0.01418336545804997, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.2562156021595001, - "kl": 0.6030757449567318, - "learning_rate": 6.29023405115281e-08, - "loss": 0.0, - "num_tokens": 3022653.0, - "reward": 3.882105326652527, - "reward_std": 0.06679476830177009, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3821053087711335, - "rewards/quality_reward_func/std": 0.06679481281898916, - "step": 2340 - }, - { - "completion_length": 29.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 29.7, - "completions/max_terminated_length": 29.7, - "completions/mean_length": 28.4, - "completions/mean_terminated_length": 28.4, - "completions/min_length": 26.9, - "completions/min_terminated_length": 26.9, - "epoch": 0.014243978130947618, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.4871319830417633, - "kl": 0.9065874569118023, - "learning_rate": 5.535913032730295e-08, - "loss": 0.0001, - "num_tokens": 3034389.0, - "reward": 3.8068490505218504, - "reward_std": 0.13240624400787054, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3368490397930146, - "rewards/quality_reward_func/std": 0.13240620964206756, - "step": 2350 - }, - { - "completion_length": 33.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 33.2, - "completions/max_terminated_length": 33.2, - "completions/mean_length": 31.125, - "completions/mean_terminated_length": 31.125, - "completions/min_length": 29.3, - "completions/min_terminated_length": 29.3, - "epoch": 0.014304590803845269, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.13280260562896729, - "kl": 0.9531688548624515, - "learning_rate": 3.1739804233685528e-06, - "loss": 0.0001, - "num_tokens": 3048438.0, - "reward": 4.034874534606933, - "reward_std": 0.1207739002071321, - "rewards/coherence_reward_func/mean": 0.9774999976158142, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5573744893074035, - "rewards/quality_reward_func/std": 0.11691368520259857, - "step": 2360 - }, - { - "completion_length": 37.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.9, - "completions/max_terminated_length": 37.9, - "completions/mean_length": 34.6, - "completions/mean_terminated_length": 34.6, - "completions/min_length": 29.7, - "completions/min_terminated_length": 29.7, - "epoch": 0.014365203476742918, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.14214658737182617, - "kl": 0.7467479955404996, - "learning_rate": 3.1571570582470307e-06, - "loss": 0.0, - "num_tokens": 3065962.0, - "reward": 3.5746789455413817, - "reward_std": 0.10260191285051405, - "rewards/coherence_reward_func/mean": 0.9725000023841858, - "rewards/coherence_reward_func/std": 0.03403429687023163, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.102178880572319, - "rewards/quality_reward_func/std": 0.12953687296248972, - "step": 2370 - }, - { - "completion_length": 33.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 33.2, - "completions/max_terminated_length": 33.2, - "completions/mean_length": 32.175, - "completions/mean_terminated_length": 32.175, - "completions/min_length": 31.1, - "completions/min_terminated_length": 31.1, - "epoch": 0.014425816149640567, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.66828937754035, - "learning_rate": 3.1403016642175993e-06, - "loss": 0.0, - "num_tokens": 3079737.0, - "reward": 3.8128685474395754, - "reward_std": 0.028522996790707113, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3428685158491134, - "rewards/quality_reward_func/std": 0.02852298943325877, - "step": 2380 - }, - { - "completion_length": 46.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.0, - "completions/max_terminated_length": 46.0, - "completions/mean_length": 43.675, - "completions/mean_terminated_length": 43.675, - "completions/min_length": 41.0, - "completions/min_terminated_length": 41.0, - "epoch": 0.014486428822538215, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.20936551690101624, - "kl": 0.5772987704724073, - "learning_rate": 3.123415062788385e-06, - "loss": 0.0, - "num_tokens": 3092392.0, - "reward": 4.032087278366089, - "reward_std": 0.08739078380167484, - "rewards/coherence_reward_func/mean": 0.9474999964237213, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5845872640609742, - "rewards/quality_reward_func/std": 0.09230031631886959, - "step": 2390 - }, - { - "completion_length": 40.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.9, - "completions/max_terminated_length": 40.9, - "completions/mean_length": 36.225, - "completions/mean_terminated_length": 36.225, - "completions/min_length": 32.8, - "completions/min_terminated_length": 32.8, - "epoch": 0.014547041495435866, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.19811180233955383, - "kl": 0.5662164811044932, - "learning_rate": 3.106498076988519e-06, - "loss": 0.0, - "num_tokens": 3104437.0, - "reward": 3.769843649864197, - "reward_std": 0.22745495121926068, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2698437094688415, - "rewards/quality_reward_func/std": 0.2274549625813961, - "step": 2400 - }, - { - "completion_length": 46.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.2, - "completions/max_terminated_length": 46.2, - "completions/mean_length": 41.15, - "completions/mean_terminated_length": 41.15, - "completions/min_length": 37.3, - "completions/min_terminated_length": 37.3, - "epoch": 0.014607654168333515, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.376869261264801, - "kl": 0.5972482226788998, - "learning_rate": 3.089551531328021e-06, - "loss": 0.0001, - "num_tokens": 3117803.0, - "reward": 3.681988263130188, - "reward_std": 0.19493986666202545, - "rewards/coherence_reward_func/mean": 0.9899999976158143, - "rewards/coherence_reward_func/std": 0.020000000298023225, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1919882595539093, - "rewards/quality_reward_func/std": 0.17984362840652465, - "step": 2410 - }, - { - "completion_length": 47.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 47.9, - "completions/max_terminated_length": 47.9, - "completions/mean_length": 45.675, - "completions/mean_terminated_length": 45.675, - "completions/min_length": 42.9, - "completions/min_terminated_length": 42.9, - "epoch": 0.014668266841231164, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.19029180705547333, - "kl": 0.5135284159332514, - "learning_rate": 3.0725762517576197e-06, - "loss": 0.0, - "num_tokens": 3130486.0, - "reward": 3.933048152923584, - "reward_std": 0.05088914311490953, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4330482244491578, - "rewards/quality_reward_func/std": 0.050889137154445055, - "step": 2420 - }, - { - "completion_length": 40.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.6, - "completions/max_terminated_length": 40.6, - "completions/mean_length": 37.2, - "completions/mean_terminated_length": 37.2, - "completions/min_length": 33.6, - "completions/min_terminated_length": 33.6, - "epoch": 0.014728879514128814, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.17371055483818054, - "kl": 0.5341577146202325, - "learning_rate": 3.0555730656284917e-06, - "loss": 0.0, - "num_tokens": 3145170.0, - "reward": 3.8729047536849976, - "reward_std": 0.26118058804422617, - "rewards/coherence_reward_func/mean": 0.9449999988079071, - "rewards/coherence_reward_func/std": 0.05, - "rewards/formatting_reward_func/mean": 1.4625, - "rewards/formatting_reward_func/std": 0.075, - "rewards/quality_reward_func/mean": 1.465404713153839, - "rewards/quality_reward_func/std": 0.13618059642612934, - "step": 2430 - }, - { - "completion_length": 45.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.1, - "completions/max_terminated_length": 45.1, - "completions/mean_length": 41.275, - "completions/mean_terminated_length": 41.275, - "completions/min_length": 38.4, - "completions/min_terminated_length": 38.4, - "epoch": 0.014789492187026463, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.5556940101087093, - "learning_rate": 3.0385428016519413e-06, - "loss": 0.0, - "num_tokens": 3158737.0, - "reward": 3.77139093875885, - "reward_std": 0.10554739125072957, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3013909101486205, - "rewards/quality_reward_func/std": 0.10554737057536841, - "step": 2440 - }, - { - "completion_length": 45.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.2, - "completions/max_terminated_length": 45.2, - "completions/mean_length": 39.375, - "completions/mean_terminated_length": 39.375, - "completions/min_length": 35.2, - "completions/min_terminated_length": 35.2, - "epoch": 0.014850104859924112, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.27828526496887207, - "kl": 0.4428048962727189, - "learning_rate": 3.02148628985901e-06, - "loss": 0.0, - "num_tokens": 3170780.0, - "reward": 3.7035243988037108, - "reward_std": 0.19730213293805718, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2110244035720825, - "rewards/quality_reward_func/std": 0.18937852047383785, - "step": 2450 - }, - { - "completion_length": 43.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.2, - "completions/max_terminated_length": 43.2, - "completions/mean_length": 39.45, - "completions/mean_terminated_length": 39.45, - "completions/min_length": 36.2, - "completions/min_terminated_length": 36.2, - "epoch": 0.014910717532821763, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.1852184385061264, - "kl": 0.5022999217733741, - "learning_rate": 3.0044043615600176e-06, - "loss": 0.0, - "num_tokens": 3185042.0, - "reward": 3.773235487937927, - "reward_std": 0.07152210185304284, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2882355391979217, - "rewards/quality_reward_func/std": 0.08858798267319798, - "step": 2460 - }, - { - "completion_length": 46.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.9, - "completions/max_terminated_length": 46.9, - "completions/mean_length": 43.825, - "completions/mean_terminated_length": 43.825, - "completions/min_length": 39.7, - "completions/min_terminated_length": 39.7, - "epoch": 0.014971330205719412, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.12899985909461975, - "kl": 0.40727804601192474, - "learning_rate": 2.9872978493040517e-06, - "loss": 0.0, - "num_tokens": 3199671.0, - "reward": 3.994355297088623, - "reward_std": 0.07916008960455656, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4943552970886231, - "rewards/quality_reward_func/std": 0.0791601019911468, - "step": 2470 - }, - { - "completion_length": 41.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.2, - "completions/max_terminated_length": 41.2, - "completions/mean_length": 38.1, - "completions/mean_terminated_length": 38.1, - "completions/min_length": 34.9, - "completions/min_terminated_length": 34.9, - "epoch": 0.01503194287861706, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.1691986471414566, - "kl": 0.43959119068458674, - "learning_rate": 2.970167586838385e-06, - "loss": 0.0, - "num_tokens": 3210627.0, - "reward": 3.8784923553466797, - "reward_std": 0.06513606734806672, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3784924566745758, - "rewards/quality_reward_func/std": 0.06513608191162348, - "step": 2480 - }, - { - "completion_length": 42.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.8, - "completions/max_terminated_length": 42.8, - "completions/mean_length": 37.95, - "completions/mean_terminated_length": 37.95, - "completions/min_length": 34.5, - "completions/min_terminated_length": 34.5, - "epoch": 0.015092555551514711, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.16865229606628418, - "kl": 0.5600904650986195, - "learning_rate": 2.9530144090678435e-06, - "loss": 0.0, - "num_tokens": 3223301.0, - "reward": 3.7232309341430665, - "reward_std": 0.11923125218600035, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2232309579849243, - "rewards/quality_reward_func/std": 0.11923126853071153, - "step": 2490 - }, - { - "completion_length": 42.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.0, - "completions/max_terminated_length": 42.0, - "completions/mean_length": 39.325, - "completions/mean_terminated_length": 39.325, - "completions/min_length": 36.1, - "completions/min_terminated_length": 36.1, - "epoch": 0.01515316822441236, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.22221040725708008, - "kl": 0.6170519307255745, - "learning_rate": 2.935839152014112e-06, - "loss": 0.0, - "num_tokens": 3238534.0, - "reward": 3.781364846229553, - "reward_std": 0.11561555415391922, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3113647818565368, - "rewards/quality_reward_func/std": 0.11561555750668048, - "step": 2500 - }, - { - "completion_length": 46.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.4, - "completions/max_terminated_length": 46.4, - "completions/mean_length": 43.55, - "completions/mean_terminated_length": 43.55, - "completions/min_length": 40.5, - "completions/min_terminated_length": 40.5, - "epoch": 0.015213780897310009, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.21276995539665222, - "kl": 0.5121112320572138, - "learning_rate": 2.918642652774989e-06, - "loss": 0.0, - "num_tokens": 3249820.0, - "reward": 3.8134325742721558, - "reward_std": 0.0658552709966898, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3434325933456421, - "rewards/quality_reward_func/std": 0.06585524827241898, - "step": 2510 - }, - { - "completion_length": 44.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.5, - "completions/max_terminated_length": 44.5, - "completions/mean_length": 42.7, - "completions/mean_terminated_length": 42.7, - "completions/min_length": 41.3, - "completions/min_terminated_length": 41.3, - "epoch": 0.01527439357020766, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.3839207589626312, - "kl": 0.3808361187577248, - "learning_rate": 2.9014257494835863e-06, - "loss": 0.0, - "num_tokens": 3262784.0, - "reward": 3.908826160430908, - "reward_std": 0.08951527504250407, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4388261198997498, - "rewards/quality_reward_func/std": 0.08951530596241355, - "step": 2520 - }, - { - "completion_length": 42.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.3, - "completions/max_terminated_length": 42.3, - "completions/mean_length": 38.75, - "completions/mean_terminated_length": 38.75, - "completions/min_length": 36.3, - "completions/min_terminated_length": 36.3, - "epoch": 0.015335006243105308, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.2821073532104492, - "kl": 0.3947053180076182, - "learning_rate": 2.884189281267481e-06, - "loss": 0.0, - "num_tokens": 3275266.0, - "reward": 4.0385295152664185, - "reward_std": 0.10868711099028587, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5385294795036315, - "rewards/quality_reward_func/std": 0.10868713408708572, - "step": 2530 - }, - { - "completion_length": 49.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 49.0, - "completions/max_terminated_length": 49.0, - "completions/mean_length": 45.3, - "completions/mean_terminated_length": 45.3, - "completions/min_length": 40.8, - "completions/min_terminated_length": 40.8, - "epoch": 0.015395618916002957, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.14954452216625214, - "kl": 0.568367613106966, - "learning_rate": 2.8669340882078166e-06, - "loss": 0.0001, - "num_tokens": 3289442.0, - "reward": 3.994439148902893, - "reward_std": 0.08471027053892613, - "rewards/coherence_reward_func/mean": 0.9099999964237213, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5844391584396362, - "rewards/quality_reward_func/std": 0.08471028534695506, - "step": 2540 - }, - { - "completion_length": 39.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.9, - "completions/max_terminated_length": 39.9, - "completions/mean_length": 38.975, - "completions/mean_terminated_length": 38.975, - "completions/min_length": 37.8, - "completions/min_terminated_length": 37.8, - "epoch": 0.015456231588900608, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.0, - "kl": 0.5600289195775986, - "learning_rate": 2.8496610112983607e-06, - "loss": 0.0, - "num_tokens": 3303761.0, - "reward": 3.9027648687362673, - "reward_std": 0.05694897845387459, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4327647507190704, - "rewards/quality_reward_func/std": 0.05694894678890705, - "step": 2550 - }, - { - "completion_length": 45.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.1, - "completions/max_terminated_length": 45.1, - "completions/mean_length": 39.425, - "completions/mean_terminated_length": 39.425, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, - "epoch": 0.015516844261798257, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.17587892711162567, - "kl": 0.5447262158617378, - "learning_rate": 2.8323708924045112e-06, - "loss": 0.0, - "num_tokens": 3318370.0, - "reward": 3.9645920515060427, - "reward_std": 0.1159634368494153, - "rewards/coherence_reward_func/mean": 0.9824999988079071, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.482092034816742, - "rewards/quality_reward_func/std": 0.08113080505281686, - "step": 2560 - }, - { - "completion_length": 49.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 49.6, - "completions/max_terminated_length": 49.6, - "completions/mean_length": 45.925, - "completions/mean_terminated_length": 45.925, - "completions/min_length": 42.2, - "completions/min_terminated_length": 42.2, - "epoch": 0.015577456934695906, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.08882153779268265, - "kl": 0.7832069963216781, - "learning_rate": 2.8150645742222716e-06, - "loss": 0.0001, - "num_tokens": 3329487.0, - "reward": 3.984575343132019, - "reward_std": 0.07680719960480928, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4845752358436584, - "rewards/quality_reward_func/std": 0.07680717501789332, - "step": 2570 - }, - { - "completion_length": 39.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.0, - "completions/max_terminated_length": 39.0, - "completions/mean_length": 35.625, - "completions/mean_terminated_length": 35.625, - "completions/min_length": 32.2, - "completions/min_terminated_length": 32.2, - "epoch": 0.015638069607593556, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.42785578966140747, - "kl": 0.6773035958409309, - "learning_rate": 2.797742900237175e-06, - "loss": 0.0, - "num_tokens": 3341460.0, - "reward": 3.8624061584472655, - "reward_std": 0.09481968693435192, - "rewards/coherence_reward_func/mean": 0.9550000011920929, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4074061989784241, - "rewards/quality_reward_func/std": 0.07804237883538008, - "step": 2580 - }, - { - "completion_length": 35.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.0, - "completions/max_terminated_length": 35.0, - "completions/mean_length": 33.425, - "completions/mean_terminated_length": 33.425, - "completions/min_length": 32.4, - "completions/min_terminated_length": 32.4, - "epoch": 0.015698682280491205, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.0, - "kl": 0.6863901816308499, - "learning_rate": 2.7804067146831724e-06, - "loss": 0.0, - "num_tokens": 3354581.0, - "reward": 3.9445890188217163, - "reward_std": 0.029083981364965438, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4445890128612517, - "rewards/quality_reward_func/std": 0.029084013029932975, - "step": 2590 - }, - { - "completion_length": 42.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.5, - "completions/max_terminated_length": 42.5, - "completions/mean_length": 40.625, - "completions/mean_terminated_length": 40.625, - "completions/min_length": 39.1, - "completions/min_terminated_length": 39.1, - "epoch": 0.015759294953388854, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.2602120339870453, - "kl": 0.7010997839272022, - "learning_rate": 2.763056862501492e-06, - "loss": 0.0001, - "num_tokens": 3362930.0, - "reward": 4.203077960014343, - "reward_std": 0.05241375220939517, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.7330780267715453, - "rewards/quality_reward_func/std": 0.052413776610046627, - "step": 2600 - }, - { - "completion_length": 43.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.0, - "completions/max_terminated_length": 43.0, - "completions/mean_length": 40.05, - "completions/mean_terminated_length": 40.05, - "completions/min_length": 37.4, - "completions/min_terminated_length": 37.4, - "epoch": 0.015819907626286503, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.17323555052280426, - "kl": 0.7230022266507149, - "learning_rate": 2.7456941892994497e-06, - "loss": 0.0001, - "num_tokens": 3378688.0, - "reward": 3.926683044433594, - "reward_std": 0.047219998016953466, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4266830563545227, - "rewards/quality_reward_func/std": 0.04721996132284403, - "step": 2610 - }, - { - "completion_length": 37.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.1, - "completions/max_terminated_length": 37.1, - "completions/mean_length": 35.275, - "completions/mean_terminated_length": 35.275, - "completions/min_length": 33.5, - "completions/min_terminated_length": 33.5, - "epoch": 0.015880520299184152, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.27321839332580566, - "kl": 0.5179006870836019, - "learning_rate": 2.7283195413092444e-06, - "loss": 0.0, - "num_tokens": 3390719.0, - "reward": 3.8513688802719117, - "reward_std": 0.17389362622052432, - "rewards/coherence_reward_func/mean": 0.975, - "rewards/coherence_reward_func/std": 0.05, - "rewards/formatting_reward_func/mean": 1.4875, - "rewards/formatting_reward_func/std": 0.025, - "rewards/quality_reward_func/mean": 1.3888689264655114, - "rewards/quality_reward_func/std": 0.09925779986660928, - "step": 2620 - }, - { - "completion_length": 39.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.1, - "completions/max_terminated_length": 39.1, - "completions/mean_length": 36.675, - "completions/mean_terminated_length": 36.675, - "completions/min_length": 34.0, - "completions/min_terminated_length": 34.0, - "epoch": 0.015941132972081804, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.25069549679756165, - "kl": 0.7381820097565651, - "learning_rate": 2.7109337653467072e-06, - "loss": 0.0001, - "num_tokens": 3405230.0, - "reward": 3.8772063493728637, - "reward_std": 0.10762457083910704, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3772063851356506, - "rewards/quality_reward_func/std": 0.10762461256235838, - "step": 2630 - }, - { - "completion_length": 35.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.4, - "completions/max_terminated_length": 35.4, - "completions/mean_length": 33.6, - "completions/mean_terminated_length": 33.6, - "completions/min_length": 31.2, - "completions/min_terminated_length": 31.2, - "epoch": 0.016001745644979453, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.2519443929195404, - "kl": 1.0420048169791698, - "learning_rate": 2.6935377087700297e-06, - "loss": 0.0001, - "num_tokens": 3416838.0, - "reward": 3.91592059135437, - "reward_std": 0.06143229096196592, - "rewards/coherence_reward_func/mean": 0.9199999988079071, - "rewards/coherence_reward_func/std": 0.020000000298023225, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4959205716848374, - "rewards/quality_reward_func/std": 0.04624048583209515, - "step": 2640 - }, - { - "completion_length": 45.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.4, - "completions/max_terminated_length": 45.4, - "completions/mean_length": 42.575, - "completions/mean_terminated_length": 42.575, - "completions/min_length": 39.4, - "completions/min_terminated_length": 39.4, - "epoch": 0.016062358317877102, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.0, - "kl": 0.6294115573167801, - "learning_rate": 2.6761322194384676e-06, - "loss": 0.0001, - "num_tokens": 3430097.0, - "reward": 3.9316094875335694, - "reward_std": 0.09293769309297203, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4616094708442688, - "rewards/quality_reward_func/std": 0.09293768610805273, - "step": 2650 - }, - { - "completion_length": 43.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.1, - "completions/max_terminated_length": 43.1, - "completions/mean_length": 39.025, - "completions/mean_terminated_length": 39.025, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, - "epoch": 0.01612297099077475, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.7926542639732361, - "learning_rate": 2.6587181456710154e-06, - "loss": 0.0001, - "num_tokens": 3441302.0, - "reward": 3.769076681137085, - "reward_std": 0.03707691185409203, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2690767049789429, - "rewards/quality_reward_func/std": 0.03707688362337649, - "step": 2660 - }, - { - "completion_length": 42.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.0, - "completions/max_terminated_length": 42.0, - "completions/mean_length": 39.775, - "completions/mean_terminated_length": 39.775, - "completions/min_length": 36.7, - "completions/min_terminated_length": 36.7, - "epoch": 0.0161835836636724, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.4314729869365692, - "kl": 0.7939471632242203, - "learning_rate": 2.641296336205062e-06, - "loss": 0.0001, - "num_tokens": 3455385.0, - "reward": 3.7821798801422117, - "reward_std": 0.104853530600667, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.289679890871048, - "rewards/quality_reward_func/std": 0.09481636472046376, - "step": 2670 - }, - { - "completion_length": 37.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.0, - "completions/max_terminated_length": 37.0, - "completions/mean_length": 35.475, - "completions/mean_terminated_length": 35.475, - "completions/min_length": 34.2, - "completions/min_terminated_length": 34.2, - "epoch": 0.01624419633657005, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.33191776275634766, - "kl": 0.5365656912326813, - "learning_rate": 2.6238676401550205e-06, - "loss": 0.0, - "num_tokens": 3464440.0, - "reward": 4.1119883298873905, - "reward_std": 0.06921507436782122, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.641988343000412, - "rewards/quality_reward_func/std": 0.06921505890786647, - "step": 2680 - }, - { - "completion_length": 38.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.4, - "completions/max_terminated_length": 38.4, - "completions/mean_length": 37.25, - "completions/mean_terminated_length": 37.25, - "completions/min_length": 35.7, - "completions/min_terminated_length": 35.7, - "epoch": 0.0163048090094677, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.09902704507112503, - "kl": 0.6591603338718415, - "learning_rate": 2.6064329069709495e-06, - "loss": 0.0, - "num_tokens": 3479326.0, - "reward": 4.098807001113892, - "reward_std": 0.04632377550005913, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5988069772720337, - "rewards/quality_reward_func/std": 0.046323776338249446, - "step": 2690 - }, - { - "completion_length": 37.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.2, - "completions/max_terminated_length": 37.2, - "completions/mean_length": 34.825, - "completions/mean_terminated_length": 34.825, - "completions/min_length": 32.0, - "completions/min_terminated_length": 32.0, - "epoch": 0.01636542168236535, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.7738334469497203, - "learning_rate": 2.5889929863971465e-06, - "loss": 0.0001, - "num_tokens": 3495171.0, - "reward": 3.901769185066223, - "reward_std": 0.07665818370878696, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.409269118309021, - "rewards/quality_reward_func/std": 0.0794066557660699, - "step": 2700 - }, - { - "completion_length": 45.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.6, - "completions/max_terminated_length": 45.6, - "completions/mean_length": 40.875, - "completions/mean_terminated_length": 40.875, - "completions/min_length": 34.5, - "completions/min_terminated_length": 34.5, - "epoch": 0.016426034355263, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.5373141642659902, - "learning_rate": 2.571548728430737e-06, - "loss": 0.0, - "num_tokens": 3507754.0, - "reward": 3.834697890281677, - "reward_std": 0.06779527999460697, - "rewards/coherence_reward_func/mean": 0.9600000023841858, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.374697893857956, - "rewards/quality_reward_func/std": 0.06779528856277466, - "step": 2710 - }, - { - "completion_length": 38.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.5, - "completions/max_terminated_length": 38.5, - "completions/mean_length": 37.7, - "completions/mean_terminated_length": 37.7, - "completions/min_length": 37.2, - "completions/min_terminated_length": 37.2, - "epoch": 0.016486647028160648, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.40601110458374023, - "kl": 0.8498925857245923, - "learning_rate": 2.5541009832802448e-06, - "loss": 0.0001, - "num_tokens": 3522894.0, - "reward": 4.105284094810486, - "reward_std": 0.04365409443853423, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6052841067314148, - "rewards/quality_reward_func/std": 0.043654035578947516, - "step": 2720 - }, - { - "completion_length": 47.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 47.1, - "completions/max_terminated_length": 47.1, - "completions/mean_length": 42.1, - "completions/mean_terminated_length": 42.1, - "completions/min_length": 38.1, - "completions/min_terminated_length": 38.1, - "epoch": 0.016547259701058296, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.3551698923110962, - "kl": 0.7001808919012547, - "learning_rate": 2.536650601324152e-06, - "loss": 0.0001, - "num_tokens": 3534554.0, - "reward": 4.1095947265625, - "reward_std": 0.13604998160153628, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6395947456359863, - "rewards/quality_reward_func/std": 0.13604997415095568, - "step": 2730 - }, - { - "completion_length": 34.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.3, - "completions/max_terminated_length": 34.3, - "completions/mean_length": 32.975, - "completions/mean_terminated_length": 32.975, - "completions/min_length": 31.3, - "completions/min_terminated_length": 31.3, - "epoch": 0.016607872373955945, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.13214418292045593, - "kl": 0.7773146666586399, - "learning_rate": 2.5191984330694576e-06, - "loss": 0.0001, - "num_tokens": 3546873.0, - "reward": 3.9706292390823363, - "reward_std": 0.06375699776108376, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4706292569637298, - "rewards/quality_reward_func/std": 0.06375699583441019, - "step": 2740 - }, - { - "completion_length": 30.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 30.9, - "completions/max_terminated_length": 30.9, - "completions/mean_length": 29.325, - "completions/mean_terminated_length": 29.325, - "completions/min_length": 28.0, - "completions/min_terminated_length": 28.0, - "epoch": 0.016668485046853598, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.811890983581543, - "learning_rate": 2.501745329110219e-06, - "loss": 0.0, - "num_tokens": 3560378.0, - "reward": 3.628898596763611, - "reward_std": 0.10628243210958317, - "rewards/coherence_reward_func/mean": 0.8399999976158142, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2888986378908158, - "rewards/quality_reward_func/std": 0.10628242962993681, - "step": 2750 - }, - { - "completion_length": 45.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.2, - "completions/max_terminated_length": 45.2, - "completions/mean_length": 41.5, - "completions/mean_terminated_length": 41.5, - "completions/min_length": 38.3, - "completions/min_terminated_length": 38.3, - "epoch": 0.016729097719751247, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.3302355110645294, - "kl": 0.6666261859238147, - "learning_rate": 2.484292140086103e-06, - "loss": 0.0001, - "num_tokens": 3575330.0, - "reward": 3.565551996231079, - "reward_std": 0.0728124035988003, - "rewards/coherence_reward_func/mean": 0.9824999988079071, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.0830519676208497, - "rewards/quality_reward_func/std": 0.03782532922923565, - "step": 2760 - }, - { - "completion_length": 45.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.0, - "completions/max_terminated_length": 45.0, - "completions/mean_length": 42.75, - "completions/mean_terminated_length": 42.75, - "completions/min_length": 40.0, - "completions/min_terminated_length": 40.0, - "epoch": 0.016789710392648895, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.1726081371307373, - "kl": 0.6724840953946114, - "learning_rate": 2.4668397166409184e-06, - "loss": 0.0001, - "num_tokens": 3589156.0, - "reward": 3.8185994148254396, - "reward_std": 0.038681184966117145, - "rewards/coherence_reward_func/mean": 0.9599999964237214, - "rewards/coherence_reward_func/std": 0.020000000298023225, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.358599418401718, - "rewards/quality_reward_func/std": 0.044405206956434996, - "step": 2770 - }, - { - "completion_length": 38.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.7, - "completions/max_terminated_length": 38.7, - "completions/mean_length": 37.225, - "completions/mean_terminated_length": 37.225, - "completions/min_length": 35.8, - "completions/min_terminated_length": 35.8, - "epoch": 0.016850323065546544, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.13680297136306763, - "kl": 0.7626109138131142, - "learning_rate": 2.4493889093811624e-06, - "loss": 0.0001, - "num_tokens": 3599309.0, - "reward": 3.9401774168014527, - "reward_std": 0.04904340072534978, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4401773750782012, - "rewards/quality_reward_func/std": 0.049043377256020905, - "step": 2780 - }, - { - "completion_length": 53.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 53.2, - "completions/max_terminated_length": 53.2, - "completions/mean_length": 47.65, - "completions/mean_terminated_length": 47.65, - "completions/min_length": 41.4, - "completions/min_terminated_length": 41.4, - "epoch": 0.016910935738444193, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.16031678020954132, - "kl": 0.47936664558947084, - "learning_rate": 2.4319405688345614e-06, - "loss": 0.0001, - "num_tokens": 3610687.0, - "reward": 3.7313020706176756, - "reward_std": 0.17685027779079973, - "rewards/coherence_reward_func/mean": 0.9400000005960465, - "rewards/coherence_reward_func/std": 0.05000000149011612, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.291302090883255, - "rewards/quality_reward_func/std": 0.12713201008737088, - "step": 2790 - }, - { - "completion_length": 35.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.5, - "completions/max_terminated_length": 35.5, - "completions/mean_length": 34.0, - "completions/mean_terminated_length": 34.0, - "completions/min_length": 32.8, - "completions/min_terminated_length": 32.8, - "epoch": 0.016971548411341842, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.1880660057067871, - "kl": 0.5243588045239449, - "learning_rate": 2.414495545408619e-06, - "loss": 0.0, - "num_tokens": 3623799.0, - "reward": 3.986675810813904, - "reward_std": 0.07717739315703512, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5166758596897125, - "rewards/quality_reward_func/std": 0.07717741429805755, - "step": 2800 - }, - { - "completion_length": 47.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 47.0, - "completions/max_terminated_length": 47.0, - "completions/mean_length": 43.2, - "completions/mean_terminated_length": 43.2, - "completions/min_length": 38.7, - "completions/min_terminated_length": 38.7, - "epoch": 0.017032161084239494, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.33259984850883484, - "kl": 0.48235350362956525, - "learning_rate": 2.3970546893491637e-06, - "loss": 0.0, - "num_tokens": 3636703.0, - "reward": 3.8235735654830934, - "reward_std": 0.10343162054196, - "rewards/coherence_reward_func/mean": 0.9774999976158142, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3460735499858856, - "rewards/quality_reward_func/std": 0.10343165006488561, - "step": 2810 - }, - { - "completion_length": 34.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.9, - "completions/max_terminated_length": 34.9, - "completions/mean_length": 33.475, - "completions/mean_terminated_length": 33.475, - "completions/min_length": 31.1, - "completions/min_terminated_length": 31.1, - "epoch": 0.017092773757137143, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.17236274480819702, - "kl": 0.6768838051706553, - "learning_rate": 2.3796188506989153e-06, - "loss": 0.0, - "num_tokens": 3649826.0, - "reward": 3.8158772706985475, - "reward_std": 0.06938843043753877, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3458772778511048, - "rewards/quality_reward_func/std": 0.0693884058156982, - "step": 2820 - }, - { - "completion_length": 41.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.8, - "completions/max_terminated_length": 41.8, - "completions/mean_length": 38.125, - "completions/mean_terminated_length": 38.125, - "completions/min_length": 35.2, - "completions/min_terminated_length": 35.2, - "epoch": 0.017153386430034792, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.23252420127391815, - "kl": 0.8006950195878744, - "learning_rate": 2.3621888792560517e-06, - "loss": 0.0001, - "num_tokens": 3661951.0, - "reward": 3.8711918115615847, - "reward_std": 0.06124148964881897, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4011918485164643, - "rewards/quality_reward_func/std": 0.06124151721596718, - "step": 2830 - }, - { - "completion_length": 40.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.1, - "completions/max_terminated_length": 40.1, - "completions/mean_length": 37.4, - "completions/mean_terminated_length": 37.4, - "completions/min_length": 35.7, - "completions/min_terminated_length": 35.7, - "epoch": 0.01721399910293244, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.15335425734519958, - "kl": 0.7628875311464072, - "learning_rate": 2.3447656245327903e-06, - "loss": 0.0001, - "num_tokens": 3674359.0, - "reward": 3.9296700239181517, - "reward_std": 0.053428084636107084, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4296700298786162, - "rewards/quality_reward_func/std": 0.05342807814013213, - "step": 2840 - }, - { - "completion_length": 54.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 54.1, - "completions/max_terminated_length": 54.1, - "completions/mean_length": 47.575, - "completions/mean_terminated_length": 47.575, - "completions/min_length": 42.9, - "completions/min_terminated_length": 42.9, - "epoch": 0.01727461177583009, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.44681017491966485, - "learning_rate": 2.327349935713986e-06, - "loss": 0.0, - "num_tokens": 3684542.0, - "reward": 3.7153425455093383, - "reward_std": 0.1435070670908317, - "rewards/coherence_reward_func/mean": 0.9424999952316284, - "rewards/coherence_reward_func/std": 0.08500000089406967, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.272842526435852, - "rewards/quality_reward_func/std": 0.0679278950439766, - "step": 2850 - }, - { - "completion_length": 39.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.3, - "completions/max_terminated_length": 39.3, - "completions/mean_length": 37.45, - "completions/mean_terminated_length": 37.45, - "completions/min_length": 35.4, - "completions/min_terminated_length": 35.4, - "epoch": 0.01733522444872774, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.3788013458251953, - "kl": 0.6096105173230171, - "learning_rate": 2.309942661615742e-06, - "loss": 0.0, - "num_tokens": 3696104.0, - "reward": 4.1152328729629515, - "reward_std": 0.07895004339516162, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6152329444885254, - "rewards/quality_reward_func/std": 0.07895006146281958, - "step": 2860 - }, - { - "completion_length": 37.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.2, - "completions/max_terminated_length": 37.2, - "completions/mean_length": 36.325, - "completions/mean_terminated_length": 36.325, - "completions/min_length": 34.7, - "completions/min_terminated_length": 34.7, - "epoch": 0.01739583712162539, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.2079620063304901, - "kl": 0.6374891117215157, - "learning_rate": 2.2925446506440403e-06, - "loss": 0.0, - "num_tokens": 3709281.0, - "reward": 3.604074263572693, - "reward_std": 0.09577681496739388, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.13407421708107, - "rewards/quality_reward_func/std": 0.09577685054391623, - "step": 2870 - }, - { - "completion_length": 44.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.4, - "completions/max_terminated_length": 44.4, - "completions/mean_length": 41.075, - "completions/mean_terminated_length": 41.075, - "completions/min_length": 37.8, - "completions/min_terminated_length": 37.8, - "epoch": 0.01745644979452304, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.6280573938041926, - "learning_rate": 2.2751567507533908e-06, - "loss": 0.0, - "num_tokens": 3720536.0, - "reward": 3.911236310005188, - "reward_std": 0.09300461623352022, - "rewards/coherence_reward_func/mean": 0.9774999976158142, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.433736264705658, - "rewards/quality_reward_func/std": 0.08958076136186718, - "step": 2880 - }, - { - "completion_length": 44.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.1, - "completions/max_terminated_length": 44.1, - "completions/mean_length": 42.375, - "completions/mean_terminated_length": 42.375, - "completions/min_length": 40.1, - "completions/min_terminated_length": 40.1, - "epoch": 0.01751706246742069, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.65657629519701, - "learning_rate": 2.2577798094055028e-06, - "loss": 0.0001, - "num_tokens": 3734043.0, - "reward": 3.9169987440109253, - "reward_std": 0.09950992427766323, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4169987499713899, - "rewards/quality_reward_func/std": 0.09950996097177267, - "step": 2890 - }, - { - "completion_length": 44.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.6, - "completions/max_terminated_length": 44.6, - "completions/mean_length": 40.075, - "completions/mean_terminated_length": 40.075, - "completions/min_length": 37.0, - "completions/min_terminated_length": 37.0, - "epoch": 0.017577675140318338, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.19218558073043823, - "kl": 0.4676514953374863, - "learning_rate": 2.2404146735279823e-06, - "loss": 0.0, - "num_tokens": 3748242.0, - "reward": 3.964124250411987, - "reward_std": 0.06019405350089073, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4941242456436157, - "rewards/quality_reward_func/std": 0.060194038599729535, - "step": 2900 - }, - { - "completion_length": 31.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 31.9, - "completions/max_terminated_length": 31.9, - "completions/mean_length": 31.75, - "completions/mean_terminated_length": 31.75, - "completions/min_length": 31.4, - "completions/min_terminated_length": 31.4, - "epoch": 0.017638287813215987, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.0, - "kl": 0.6983957252814434, - "learning_rate": 2.223062189473054e-06, - "loss": 0.0, - "num_tokens": 3760236.0, - "reward": 3.952869749069214, - "reward_std": 0.014002268150215968, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4528697073459624, - "rewards/quality_reward_func/std": 0.0140022435458377, - "step": 2910 - }, - { - "completion_length": 41.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.6, - "completions/max_terminated_length": 41.6, - "completions/mean_length": 39.8, - "completions/mean_terminated_length": 39.8, - "completions/min_length": 37.4, - "completions/min_terminated_length": 37.4, - "epoch": 0.017698900486113636, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.18282218277454376, - "kl": 0.4654235543683171, - "learning_rate": 2.2057232029763092e-06, - "loss": 0.0, - "num_tokens": 3770848.0, - "reward": 3.6577728748321534, - "reward_std": 0.0783234752714634, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.187772911787033, - "rewards/quality_reward_func/std": 0.07832347678486258, - "step": 2920 - }, - { - "completion_length": 33.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 33.3, - "completions/max_terminated_length": 33.3, - "completions/mean_length": 31.725, - "completions/mean_terminated_length": 31.725, - "completions/min_length": 30.7, - "completions/min_terminated_length": 30.7, - "epoch": 0.017759513159011284, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.1789316087961197, - "kl": 0.6979955821298063, - "learning_rate": 2.188398559115489e-06, - "loss": 0.0, - "num_tokens": 3783945.0, - "reward": 3.916050124168396, - "reward_std": 0.14435958303511143, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4160500228405, - "rewards/quality_reward_func/std": 0.14435954354703426, - "step": 2930 - }, - { - "completion_length": 40.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.8, - "completions/max_terminated_length": 40.8, - "completions/mean_length": 38.725, - "completions/mean_terminated_length": 38.725, - "completions/min_length": 37.0, - "completions/min_terminated_length": 37.0, - "epoch": 0.017820125831908937, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.27817022800445557, - "kl": 0.5846670583821834, - "learning_rate": 2.171089102269294e-06, - "loss": 0.0, - "num_tokens": 3796002.0, - "reward": 3.9199183702468874, - "reward_std": 0.07633975064381956, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4499184250831605, - "rewards/quality_reward_func/std": 0.07633973751217127, - "step": 2940 - }, - { - "completion_length": 46.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.5, - "completions/max_terminated_length": 46.5, - "completions/mean_length": 41.75, - "completions/mean_terminated_length": 41.75, - "completions/min_length": 38.2, - "completions/min_terminated_length": 38.2, - "epoch": 0.017880738504806586, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.6097869090735912, - "learning_rate": 2.1537956760762296e-06, - "loss": 0.0, - "num_tokens": 3806716.0, - "reward": 3.9926924228668215, - "reward_std": 0.11543759661726653, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4926924526691436, - "rewards/quality_reward_func/std": 0.11543761247303337, - "step": 2950 - }, - { - "completion_length": 40.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.7, - "completions/max_terminated_length": 40.7, - "completions/mean_length": 38.825, - "completions/mean_terminated_length": 38.825, - "completions/min_length": 35.9, - "completions/min_terminated_length": 35.9, - "epoch": 0.017941351177704234, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.11390696465969086, - "kl": 0.6256432753056288, - "learning_rate": 2.136519123393493e-06, - "loss": 0.0, - "num_tokens": 3820641.0, - "reward": 4.011971044540405, - "reward_std": 0.08256426015868783, - "rewards/coherence_reward_func/mean": 0.9600000023841858, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5519711196422576, - "rewards/quality_reward_func/std": 0.08256429834291339, - "step": 2960 - }, - { - "completion_length": 43.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.8, - "completions/max_terminated_length": 43.8, - "completions/mean_length": 38.875, - "completions/mean_terminated_length": 38.875, - "completions/min_length": 35.0, - "completions/min_terminated_length": 35.0, - "epoch": 0.018001963850601883, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.35060346126556396, - "kl": 0.6076034534722566, - "learning_rate": 2.1192602862558864e-06, - "loss": 0.0, - "num_tokens": 3833028.0, - "reward": 3.6884315967559815, - "reward_std": 0.12104190215468406, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.188431590795517, - "rewards/quality_reward_func/std": 0.1210419088602066, - "step": 2970 - }, - { - "completion_length": 52.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 52.4, - "completions/max_terminated_length": 52.4, - "completions/mean_length": 45.85, - "completions/mean_terminated_length": 45.85, - "completions/min_length": 39.3, - "completions/min_terminated_length": 39.3, - "epoch": 0.018062576523499532, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.3901742398738861, - "kl": 0.5164080902934074, - "learning_rate": 2.1020200058347836e-06, - "loss": 0.0, - "num_tokens": 3845054.0, - "reward": 3.76722936630249, - "reward_std": 0.08552623242139816, - "rewards/coherence_reward_func/mean": 0.9399999976158142, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3272293865680695, - "rewards/quality_reward_func/std": 0.08552621733397245, - "step": 2980 - }, - { - "completion_length": 51.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 51.5, - "completions/max_terminated_length": 51.5, - "completions/mean_length": 47.825, - "completions/mean_terminated_length": 47.825, - "completions/min_length": 43.5, - "completions/min_terminated_length": 43.5, - "epoch": 0.01812318919639718, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.5072823166847229, - "kl": 0.5146443182602525, - "learning_rate": 2.0847991223971305e-06, - "loss": 0.0, - "num_tokens": 3856715.0, - "reward": 3.7614234685897827, - "reward_std": 0.16738499663770198, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2914234220981597, - "rewards/quality_reward_func/std": 0.16738500162027775, - "step": 2990 - }, - { - "completion_length": 48.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 48.7, - "completions/max_terminated_length": 48.7, - "completions/mean_length": 42.675, - "completions/mean_terminated_length": 42.675, - "completions/min_length": 39.2, - "completions/min_terminated_length": 39.2, - "epoch": 0.018183801869294833, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.23874402046203613, - "kl": 0.4912438288331032, - "learning_rate": 2.067598475264491e-06, - "loss": 0.0, - "num_tokens": 3869474.0, - "reward": 4.022751498222351, - "reward_std": 0.03978296392597258, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5227514982223511, - "rewards/quality_reward_func/std": 0.039782968908548356, - "step": 3000 - }, - { - "completion_length": 39.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.3, - "completions/max_terminated_length": 39.3, - "completions/mean_length": 37.625, - "completions/mean_terminated_length": 37.625, - "completions/min_length": 36.0, - "completions/min_terminated_length": 36.0, - "epoch": 0.018244414542192482, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.17149239778518677, - "kl": 0.7874499909579754, - "learning_rate": 2.0504189027721396e-06, - "loss": 0.0001, - "num_tokens": 3883319.0, - "reward": 3.956950831413269, - "reward_std": 0.05743742329068482, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4569508492946626, - "rewards/quality_reward_func/std": 0.057437406154349446, - "step": 3010 - }, - { - "completion_length": 46.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 46.7, - "completions/max_terminated_length": 46.7, - "completions/mean_length": 44.225, - "completions/mean_terminated_length": 44.225, - "completions/min_length": 41.6, - "completions/min_terminated_length": 41.6, - "epoch": 0.01830502721509013, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.43216586112976074, - "kl": 0.8041438281536102, - "learning_rate": 2.033261242228203e-06, - "loss": 0.0001, - "num_tokens": 3898256.0, - "reward": 4.001222062110901, - "reward_std": 0.30263486690819263, - "rewards/coherence_reward_func/mean": 0.9224999964237213, - "rewards/coherence_reward_func/std": 0.06500000208616256, - "rewards/formatting_reward_func/mean": 1.4625, - "rewards/formatting_reward_func/std": 0.075, - "rewards/quality_reward_func/mean": 1.61622211933136, - "rewards/quality_reward_func/std": 0.16852968987077474, - "step": 3020 - }, - { - "completion_length": 41.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.3, - "completions/max_terminated_length": 41.3, - "completions/mean_length": 37.675, - "completions/mean_terminated_length": 37.675, - "completions/min_length": 34.7, - "completions/min_terminated_length": 34.7, - "epoch": 0.01836563988798778, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.18524278700351715, - "kl": 0.5940576240420341, - "learning_rate": 2.0161263298728494e-06, - "loss": 0.0, - "num_tokens": 3908251.0, - "reward": 3.921669435501099, - "reward_std": 0.08791780807077884, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4216694355010986, - "rewards/quality_reward_func/std": 0.08791773918783292, - "step": 3030 - }, - { - "completion_length": 38.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.0, - "completions/max_terminated_length": 38.0, - "completions/mean_length": 35.875, - "completions/mean_terminated_length": 35.875, - "completions/min_length": 33.8, - "completions/min_terminated_length": 33.8, - "epoch": 0.01842625256088543, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.2265503704547882, - "kl": 0.8165366999804974, - "learning_rate": 1.9990150008375348e-06, - "loss": 0.0001, - "num_tokens": 3920502.0, - "reward": 3.9495574712753294, - "reward_std": 0.055046566482633355, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4495575547218322, - "rewards/quality_reward_func/std": 0.055046635866165164, - "step": 3040 - }, - { - "completion_length": 35.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.0, - "completions/max_terminated_length": 35.0, - "completions/mean_length": 33.275, - "completions/mean_terminated_length": 33.275, - "completions/min_length": 30.8, - "completions/min_terminated_length": 30.8, - "epoch": 0.018486865233783078, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.1463969200849533, - "kl": 0.8312926575541496, - "learning_rate": 1.981928089104294e-06, - "loss": 0.0, - "num_tokens": 3933233.0, - "reward": 3.618661141395569, - "reward_std": 0.0848767876625061, - "rewards/coherence_reward_func/mean": 0.9399999976158142, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.17866108417511, - "rewards/quality_reward_func/std": 0.08487678728997708, - "step": 3050 - }, - { - "completion_length": 42.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.2, - "completions/max_terminated_length": 42.2, - "completions/mean_length": 38.825, - "completions/mean_terminated_length": 38.825, - "completions/min_length": 36.4, - "completions/min_terminated_length": 36.4, - "epoch": 0.01854747790668073, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.7010526616126299, - "learning_rate": 1.9648664274651e-06, - "loss": 0.0001, - "num_tokens": 3947194.0, - "reward": 3.801596736907959, - "reward_std": 0.07207065261900425, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3015966907143592, - "rewards/quality_reward_func/std": 0.07207060419023037, - "step": 3060 - }, - { - "completion_length": 38.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.5, - "completions/max_terminated_length": 38.5, - "completions/mean_length": 37.425, - "completions/mean_terminated_length": 37.425, - "completions/min_length": 35.5, - "completions/min_terminated_length": 35.5, - "epoch": 0.01860809057957838, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.17187735438346863, - "kl": 0.7273755632340908, - "learning_rate": 1.947830847481271e-06, - "loss": 0.0001, - "num_tokens": 3962491.0, - "reward": 3.861338996887207, - "reward_std": 0.06818611929193139, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3613389432430267, - "rewards/quality_reward_func/std": 0.06818618662655354, - "step": 3070 - }, - { - "completion_length": 35.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 35.7, - "completions/max_terminated_length": 35.7, - "completions/mean_length": 32.9, - "completions/mean_terminated_length": 32.9, - "completions/min_length": 30.9, - "completions/min_terminated_length": 30.9, - "epoch": 0.018668703252476028, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.16616536676883698, - "kl": 0.7653290726244449, - "learning_rate": 1.93082217944294e-06, - "loss": 0.0001, - "num_tokens": 3975243.0, - "reward": 3.9271071672439577, - "reward_std": 0.03739016959443688, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4421072006225586, - "rewards/quality_reward_func/std": 0.05399639131501317, - "step": 3080 - }, - { - "completion_length": 40.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.2, - "completions/max_terminated_length": 40.2, - "completions/mean_length": 38.75, - "completions/mean_terminated_length": 38.75, - "completions/min_length": 36.4, - "completions/min_terminated_length": 36.4, - "epoch": 0.018729315925373677, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.7989107012748718, - "learning_rate": 1.9138412523285937e-06, - "loss": 0.0001, - "num_tokens": 3992245.0, - "reward": 3.9924602270126344, - "reward_std": 0.10027031376957893, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.49996018409729, - "rewards/quality_reward_func/std": 0.08873173072934151, - "step": 3090 - }, - { - "completion_length": 40.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.0, - "completions/max_terminated_length": 40.0, - "completions/mean_length": 37.675, - "completions/mean_terminated_length": 37.675, - "completions/min_length": 35.8, - "completions/min_terminated_length": 35.8, - "epoch": 0.018789928598271326, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.1242041140794754, - "kl": 0.7136042453348637, - "learning_rate": 1.8968888937646624e-06, - "loss": 0.0001, - "num_tokens": 4006452.0, - "reward": 3.7888214111328127, - "reward_std": 0.08842599894851447, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2888214111328125, - "rewards/quality_reward_func/std": 0.08842603755183517, - "step": 3100 - }, - { - "completion_length": 44.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 44.7, - "completions/max_terminated_length": 44.7, - "completions/mean_length": 41.075, - "completions/mean_terminated_length": 41.075, - "completions/min_length": 38.1, - "completions/min_terminated_length": 38.1, - "epoch": 0.018850541271168975, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 0.605262978747487, - "learning_rate": 1.879965929985187e-06, - "loss": 0.0, - "num_tokens": 4020499.0, - "reward": 3.834125018119812, - "reward_std": 0.12141236998140811, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3641250669956206, - "rewards/quality_reward_func/std": 0.12141232704743743, - "step": 3110 - }, - { - "completion_length": 43.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.4, - "completions/max_terminated_length": 43.4, - "completions/mean_length": 39.7, - "completions/mean_terminated_length": 39.7, - "completions/min_length": 36.8, - "completions/min_terminated_length": 36.8, - "epoch": 0.018911153944066627, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.26001664996147156, - "kl": 0.645774920284748, - "learning_rate": 1.8630731857915451e-06, - "loss": 0.0, - "num_tokens": 4034619.0, - "reward": 3.571972608566284, - "reward_std": 0.2117477380670607, - "rewards/coherence_reward_func/mean": 0.925, - "rewards/coherence_reward_func/std": 0.05, - "rewards/formatting_reward_func/mean": 1.4625, - "rewards/formatting_reward_func/std": 0.025, - "rewards/quality_reward_func/mean": 1.184472641348839, - "rewards/quality_reward_func/std": 0.13674773583188654, - "step": 3120 - }, - { - "completion_length": 40.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.9, - "completions/max_terminated_length": 40.9, - "completions/mean_length": 40.15, - "completions/mean_terminated_length": 40.15, - "completions/min_length": 39.1, - "completions/min_terminated_length": 39.1, - "epoch": 0.018971766616964276, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.6743199843913317, - "learning_rate": 1.8462114845122582e-06, - "loss": 0.0, - "num_tokens": 4045341.0, - "reward": 4.098270344734192, - "reward_std": 0.039933528192341326, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6282702922821044, - "rewards/quality_reward_func/std": 0.03993358239531517, - "step": 3130 - }, - { - "completion_length": 42.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 42.6, - "completions/max_terminated_length": 42.6, - "completions/mean_length": 40.35, - "completions/mean_terminated_length": 40.35, - "completions/min_length": 38.3, - "completions/min_terminated_length": 38.3, - "epoch": 0.019032379289861925, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.20205184817314148, - "kl": 0.5343499461188912, - "learning_rate": 1.8293816479628584e-06, - "loss": 0.0, - "num_tokens": 4057039.0, - "reward": 3.8032551288604735, - "reward_std": 0.08389510039705783, - "rewards/coherence_reward_func/mean": 0.95, - "rewards/coherence_reward_func/std": 0.02309400886297226, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3532551646232605, - "rewards/quality_reward_func/std": 0.0626519579673186, - "step": 3140 - }, - { - "completion_length": 45.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.4, - "completions/max_terminated_length": 45.4, - "completions/mean_length": 42.625, - "completions/mean_terminated_length": 42.625, - "completions/min_length": 39.5, - "completions/min_terminated_length": 39.5, - "epoch": 0.019092991962759574, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.24577243626117706, - "kl": 0.6044593669474125, - "learning_rate": 1.8125844964058354e-06, - "loss": 0.0001, - "num_tokens": 4072148.0, - "reward": 3.954366612434387, - "reward_std": 0.13101303055882454, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4843665599822997, - "rewards/quality_reward_func/std": 0.13101302906870843, - "step": 3150 - }, - { - "completion_length": 50.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 50.6, - "completions/max_terminated_length": 50.6, - "completions/mean_length": 47.1, - "completions/mean_terminated_length": 47.1, - "completions/min_length": 44.2, - "completions/min_terminated_length": 44.2, - "epoch": 0.019153604635657222, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.10859663039445877, - "kl": 0.654767077229917, - "learning_rate": 1.7958208485106586e-06, - "loss": 0.0001, - "num_tokens": 4086408.0, - "reward": 4.028897976875305, - "reward_std": 0.039625269593670964, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5588978946208953, - "rewards/quality_reward_func/std": 0.03962532239966095, - "step": 3160 - }, - { - "completion_length": 43.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 43.7, - "completions/max_terminated_length": 43.7, - "completions/mean_length": 40.05, - "completions/mean_terminated_length": 40.05, - "completions/min_length": 34.6, - "completions/min_terminated_length": 34.6, - "epoch": 0.01921421730855487, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.18048399686813354, - "kl": 0.6584017806686461, - "learning_rate": 1.7790915213138777e-06, - "loss": 0.0001, - "num_tokens": 4100390.0, - "reward": 3.640557956695557, - "reward_std": 0.11440884659532458, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1480579733848573, - "rewards/quality_reward_func/std": 0.11514959074556827, - "step": 3170 - }, - { - "completion_length": 41.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.3, - "completions/max_terminated_length": 41.3, - "completions/mean_length": 39.625, - "completions/mean_terminated_length": 39.625, - "completions/min_length": 37.3, - "completions/min_terminated_length": 37.3, - "epoch": 0.019274829981452524, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.22853608429431915, - "kl": 0.5096729185432196, - "learning_rate": 1.7623973301792964e-06, - "loss": 0.0, - "num_tokens": 4112795.0, - "reward": 3.6959209203720094, - "reward_std": 0.09516287457663566, - "rewards/coherence_reward_func/mean": 0.8950000017881393, - "rewards/coherence_reward_func/std": 0.037320506572723386, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3009209588170052, - "rewards/quality_reward_func/std": 0.06619152534985914, - "step": 3180 - }, - { - "completion_length": 37.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.1, - "completions/max_terminated_length": 37.1, - "completions/mean_length": 35.175, - "completions/mean_terminated_length": 35.175, - "completions/min_length": 33.0, - "completions/min_terminated_length": 33.0, - "epoch": 0.019335442654350173, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.2885758578777313, - "kl": 1.076241620257497, - "learning_rate": 1.745739088758242e-06, - "loss": 0.0001, - "num_tokens": 4125142.0, - "reward": 3.7729340553283692, - "reward_std": 0.06683287937194109, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2804340660572051, - "rewards/quality_reward_func/std": 0.07385497200302779, - "step": 3190 - }, - { - "completion_length": 39.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.8, - "completions/max_terminated_length": 39.8, - "completions/mean_length": 37.675, - "completions/mean_terminated_length": 37.675, - "completions/min_length": 35.3, - "completions/min_terminated_length": 35.3, - "epoch": 0.01939605532724782, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.0, - "kl": 0.8483439676463604, - "learning_rate": 1.7291176089498969e-06, - "loss": 0.0001, - "num_tokens": 4136533.0, - "reward": 3.761235785484314, - "reward_std": 0.1376682033762336, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2612357795238496, - "rewards/quality_reward_func/std": 0.13766820412129163, - "step": 3200 - }, - { - "completion_length": 39.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.5, - "completions/max_terminated_length": 39.5, - "completions/mean_length": 37.45, - "completions/mean_terminated_length": 37.45, - "completions/min_length": 34.9, - "completions/min_terminated_length": 34.9, - "epoch": 0.01945666800014547, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.43329519033432007, - "kl": 0.8543856620788575, - "learning_rate": 1.7125337008617387e-06, - "loss": 0.0001, - "num_tokens": 4148683.0, - "reward": 4.058108305931091, - "reward_std": 0.07306739278137683, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5881082952022552, - "rewards/quality_reward_func/std": 0.07306739557534456, - "step": 3210 - }, - { - "completion_length": 50.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 50.8, - "completions/max_terminated_length": 50.8, - "completions/mean_length": 46.3, - "completions/mean_terminated_length": 46.3, - "completions/min_length": 42.4, - "completions/min_terminated_length": 42.4, - "epoch": 0.01951728067304312, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.298828661441803, - "kl": 0.5643864538520574, - "learning_rate": 1.6959881727700508e-06, - "loss": 0.0001, - "num_tokens": 4160563.0, - "reward": 3.963113474845886, - "reward_std": 0.10927062275586649, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.463113397359848, - "rewards/quality_reward_func/std": 0.10927061671391129, - "step": 3220 - }, - { - "completion_length": 36.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.4, - "completions/max_terminated_length": 36.4, - "completions/mean_length": 34.2, - "completions/mean_terminated_length": 34.2, - "completions/min_length": 32.3, - "completions/min_terminated_length": 32.3, - "epoch": 0.019577893345940768, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.6677718162536621, - "kl": 0.8229854173958302, - "learning_rate": 1.679481831080531e-06, - "loss": 0.0001, - "num_tokens": 4174915.0, - "reward": 3.848472833633423, - "reward_std": 0.10531683061271906, - "rewards/coherence_reward_func/mean": 0.9774999976158142, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3709728479385377, - "rewards/quality_reward_func/std": 0.09031676249578595, - "step": 3230 - }, - { - "completion_length": 38.7, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.7, - "completions/max_terminated_length": 38.7, - "completions/mean_length": 32.525, - "completions/mean_terminated_length": 32.525, - "completions/min_length": 28.2, - "completions/min_terminated_length": 28.2, - "epoch": 0.01963850601883842, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.2357165664434433, - "kl": 1.063683407753706, - "learning_rate": 1.6630154802889859e-06, - "loss": 0.0001, - "num_tokens": 4188168.0, - "reward": 3.6370152950286867, - "reward_std": 0.08581735389307141, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1670151889324187, - "rewards/quality_reward_func/std": 0.08581737205386161, - "step": 3240 - }, - { - "completion_length": 40.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.0, - "completions/max_terminated_length": 40.0, - "completions/mean_length": 37.575, - "completions/mean_terminated_length": 37.575, - "completions/min_length": 35.3, - "completions/min_terminated_length": 35.3, - "epoch": 0.01969911869173607, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.20572206377983093, - "kl": 0.7876749344170093, - "learning_rate": 1.6465899229421225e-06, - "loss": 0.0001, - "num_tokens": 4202543.0, - "reward": 3.983400321006775, - "reward_std": 0.047867730259895325, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5134003400802611, - "rewards/quality_reward_func/std": 0.04786777477711439, - "step": 3250 - }, - { - "completion_length": 41.9, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.9, - "completions/max_terminated_length": 41.9, - "completions/mean_length": 39.8, - "completions/mean_terminated_length": 39.8, - "completions/min_length": 38.1, - "completions/min_terminated_length": 38.1, - "epoch": 0.019759731364633718, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.16716791689395905, - "kl": 0.8925764039158821, - "learning_rate": 1.630205959598433e-06, - "loss": 0.0001, - "num_tokens": 4215987.0, - "reward": 3.917082405090332, - "reward_std": 0.09645090424455702, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.424582439661026, - "rewards/quality_reward_func/std": 0.08145089484751225, - "step": 3260 - }, - { - "completion_length": 40.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 40.2, - "completions/max_terminated_length": 40.2, - "completions/mean_length": 38.55, - "completions/mean_terminated_length": 38.55, - "completions/min_length": 36.8, - "completions/min_terminated_length": 36.8, - "epoch": 0.019820344037531367, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.0, - "kl": 0.7571708537638188, - "learning_rate": 1.6138643887891765e-06, - "loss": 0.0001, - "num_tokens": 4229725.0, - "reward": 3.760947751998901, - "reward_std": 0.041113514872267845, - "rewards/coherence_reward_func/mean": 0.925, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3359477519989014, - "rewards/quality_reward_func/std": 0.038945339154452085, - "step": 3270 - }, - { - "completion_length": 38.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.8, - "completions/max_terminated_length": 38.8, - "completions/mean_length": 36.725, - "completions/mean_terminated_length": 36.725, - "completions/min_length": 35.3, - "completions/min_terminated_length": 35.3, - "epoch": 0.019880956710429016, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.2101743221282959, - "kl": 0.6681413215585053, - "learning_rate": 1.597566006979459e-06, - "loss": 0.0001, - "num_tokens": 4241722.0, - "reward": 4.057679986953735, - "reward_std": 0.09815444834530354, - "rewards/coherence_reward_func/mean": 0.9824999988079071, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5751800179481505, - "rewards/quality_reward_func/std": 0.0710665188729763, - "step": 3280 - }, - { - "completion_length": 36.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.2, - "completions/max_terminated_length": 36.2, - "completions/mean_length": 35.675, - "completions/mean_terminated_length": 35.675, - "completions/min_length": 35.1, - "completions/min_terminated_length": 35.1, - "epoch": 0.019941569383326665, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.15175789594650269, - "kl": 0.9505894485861063, - "learning_rate": 1.5813116085294172e-06, - "loss": 0.0001, - "num_tokens": 4254845.0, - "reward": 4.126423382759095, - "reward_std": 0.02618008037097752, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.6264233708381652, - "rewards/quality_reward_func/std": 0.026180061488412322, - "step": 3290 - }, - { - "completion_length": 39.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.0, - "completions/max_terminated_length": 39.0, - "completions/mean_length": 34.775, - "completions/mean_terminated_length": 34.775, - "completions/min_length": 30.5, - "completions/min_terminated_length": 30.5, - "epoch": 0.020002182056224314, - "frac_reward_zero_std": 0.0, - "grad_norm": 0.18640796840190887, - "kl": 0.9450171794742346, - "learning_rate": 1.5651019856554995e-06, - "loss": 0.0001, - "num_tokens": 4270140.0, - "reward": 3.4720892190933226, - "reward_std": 0.18899639658629894, - "rewards/coherence_reward_func/mean": 0.9474999994039536, - "rewards/coherence_reward_func/std": 0.034999999403953555, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.0245892763137818, - "rewards/quality_reward_func/std": 0.153996386192739, - "step": 3300 - }, - { - "completion_length": 41.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.8, - "completions/max_terminated_length": 41.8, - "completions/mean_length": 37.875, - "completions/mean_terminated_length": 37.875, - "completions/min_length": 35.6, - "completions/min_terminated_length": 35.6, - "epoch": 0.020062794729121966, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.18235939741134644, - "kl": 0.6776700124144555, - "learning_rate": 1.5489379283918566e-06, - "loss": 0.0, - "num_tokens": 4282883.0, - "reward": 3.8034504652023315, - "reward_std": 0.09744351711124181, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.333450472354889, - "rewards/quality_reward_func/std": 0.09744351767003537, - "step": 3310 - }, - { - "completion_length": 45.1, - "completions/clipped_ratio": 0.0, - "completions/max_length": 45.1, - "completions/max_terminated_length": 45.1, - "completions/mean_length": 40.75, - "completions/mean_terminated_length": 40.75, - "completions/min_length": 36.5, - "completions/min_terminated_length": 36.5, - "epoch": 0.020123407402019615, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.37760046124458313, - "kl": 0.949538280069828, - "learning_rate": 1.5328202245518348e-06, - "loss": 0.0001, - "num_tokens": 4295249.0, - "reward": 3.8073550939559935, - "reward_std": 0.12139484300278128, - "rewards/coherence_reward_func/mean": 0.9675000011920929, - "rewards/coherence_reward_func/std": 0.052320507168769834, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3398550689220428, - "rewards/quality_reward_func/std": 0.13598596872761845, - "step": 3320 - }, - { - "completion_length": 31.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 31.2, - "completions/max_terminated_length": 31.2, - "completions/mean_length": 30.95, - "completions/mean_terminated_length": 30.95, - "completions/min_length": 30.6, - "completions/min_terminated_length": 30.6, - "epoch": 0.020184020074917264, - "frac_reward_zero_std": 0.5, - "grad_norm": 0.0, - "kl": 0.9487326897680759, - "learning_rate": 1.5167496596895814e-06, - "loss": 0.0001, - "num_tokens": 4307515.0, - "reward": 3.90595965385437, - "reward_std": 0.07731840866617859, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4059597253799438, - "rewards/quality_reward_func/std": 0.07731841276399791, - "step": 3330 - }, - { - "completion_length": 39.4, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.4, - "completions/max_terminated_length": 39.4, - "completions/mean_length": 34.775, - "completions/mean_terminated_length": 34.775, - "completions/min_length": 31.7, - "completions/min_terminated_length": 31.7, - "epoch": 0.020244632747814913, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.23343947529792786, - "kl": 0.8172248054295779, - "learning_rate": 1.500727017061756e-06, - "loss": 0.0001, - "num_tokens": 4319442.0, - "reward": 3.517696112394333, - "reward_std": 0.2403232785873115, - "rewards/coherence_reward_func/mean": 0.925, - "rewards/coherence_reward_func/std": 0.05, - "rewards/formatting_reward_func/mean": 1.3875, - "rewards/formatting_reward_func/std": 0.075, - "rewards/quality_reward_func/mean": 1.2051961183547975, - "rewards/quality_reward_func/std": 0.11532331230118871, - "step": 3340 - }, - { - "completion_length": 39.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.8, - "completions/max_terminated_length": 39.8, - "completions/mean_length": 36.0, - "completions/mean_terminated_length": 36.0, - "completions/min_length": 32.6, - "completions/min_terminated_length": 32.6, - "epoch": 0.02030524542071256, - "frac_reward_zero_std": 0.1, - "grad_norm": 0.15448229014873505, - "kl": 0.7963626474142075, - "learning_rate": 1.4847530775893555e-06, - "loss": 0.0001, - "num_tokens": 4328866.0, - "reward": 3.692489814758301, - "reward_std": 0.12964325528591872, - "rewards/coherence_reward_func/mean": 0.9850000023841858, - "rewards/coherence_reward_func/std": 0.017320507764816286, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.207489800453186, - "rewards/quality_reward_func/std": 0.11372499875724315, - "step": 3350 - }, - { - "completion_length": 41.6, - "completions/clipped_ratio": 0.0, - "completions/max_length": 41.6, - "completions/max_terminated_length": 41.6, - "completions/mean_length": 39.8, - "completions/mean_terminated_length": 39.8, - "completions/min_length": 38.0, - "completions/min_terminated_length": 38.0, - "epoch": 0.02036585809361021, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.09208908677101135, - "kl": 0.723385076597333, - "learning_rate": 1.4688286198196524e-06, - "loss": 0.0001, - "num_tokens": 4341758.0, - "reward": 3.7400954008102416, - "reward_std": 0.06603281607385725, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2400953948497773, - "rewards/quality_reward_func/std": 0.06603283025324344, - "step": 3360 - }, - { - "completion_length": 34.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.0, - "completions/max_terminated_length": 34.0, - "completions/mean_length": 31.975, - "completions/mean_terminated_length": 31.975, - "completions/min_length": 30.2, - "completions/min_terminated_length": 30.2, - "epoch": 0.020426470766507863, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.2501620054244995, - "kl": 0.9304806463420391, - "learning_rate": 1.4529544198882545e-06, - "loss": 0.0001, - "num_tokens": 4354805.0, - "reward": 3.899626541137695, - "reward_std": 0.0629223863594234, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3996265590190888, - "rewards/quality_reward_func/std": 0.06292239651083946, - "step": 3370 - }, - { - "completion_length": 38.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 38.8, - "completions/max_terminated_length": 38.8, - "completions/mean_length": 37.15, - "completions/mean_terminated_length": 37.15, - "completions/min_length": 35.8, - "completions/min_terminated_length": 35.8, - "epoch": 0.02048708343940551, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 1.0188420228660107, - "learning_rate": 1.4371312514812686e-06, - "loss": 0.0001, - "num_tokens": 4369087.0, - "reward": 4.25221619606018, - "reward_std": 0.07234206513967364, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.7822161555290221, - "rewards/quality_reward_func/std": 0.07234203468542547, - "step": 3380 - }, - { - "completion_length": 39.3, - "completions/clipped_ratio": 0.0, - "completions/max_length": 39.3, - "completions/max_terminated_length": 39.3, - "completions/mean_length": 37.65, - "completions/mean_terminated_length": 37.65, - "completions/min_length": 35.9, - "completions/min_terminated_length": 35.9, - "epoch": 0.02054769611230316, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.14413855969905853, - "kl": 0.8577508796006441, - "learning_rate": 1.4213598857976024e-06, - "loss": 0.0001, - "num_tokens": 4384409.0, - "reward": 3.8944859981536863, - "reward_std": 0.11143267480656505, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.4244859516620636, - "rewards/quality_reward_func/std": 0.11143264099955559, - "step": 3390 - }, - { - "completion_length": 36.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.8, - "completions/max_terminated_length": 36.8, - "completions/mean_length": 34.45, - "completions/mean_terminated_length": 34.45, - "completions/min_length": 31.4, - "completions/min_terminated_length": 31.4, - "epoch": 0.02060830878520081, - "frac_reward_zero_std": 0.2, - "grad_norm": 0.0, - "kl": 1.0671353876590728, - "learning_rate": 1.405641091511368e-06, - "loss": 0.0001, - "num_tokens": 4396347.0, - "reward": 3.7803528785705565, - "reward_std": 0.07597720525227487, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2803528964519502, - "rewards/quality_reward_func/std": 0.07597720911726355, - "step": 3400 - }, - { - "completion_length": 37.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 37.5, - "completions/max_terminated_length": 37.5, - "completions/mean_length": 34.575, - "completions/mean_terminated_length": 34.575, - "completions/min_length": 32.2, - "completions/min_terminated_length": 32.2, - "epoch": 0.02066892145809846, - "frac_reward_zero_std": 0.4, - "grad_norm": 0.0, - "kl": 0.9999676614999771, - "learning_rate": 1.3899756347344235e-06, - "loss": 0.0001, - "num_tokens": 4409734.0, - "reward": 4.007402873039245, - "reward_std": 0.06215168377384543, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.5074028015136718, - "rewards/quality_reward_func/std": 0.06215167883783579, - "step": 3410 - }, - { - "completion_length": 33.8, - "completions/clipped_ratio": 0.0, - "completions/max_length": 33.8, - "completions/max_terminated_length": 33.8, - "completions/mean_length": 32.0, - "completions/mean_terminated_length": 32.0, - "completions/min_length": 30.6, - "completions/min_terminated_length": 30.6, - "epoch": 0.020729534130996107, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.2397295981645584, - "kl": 0.9598039738833904, - "learning_rate": 1.3743642789790317e-06, - "loss": 0.0001, - "num_tokens": 4420570.0, - "reward": 3.8145102739334105, - "reward_std": 0.07386084916070104, - "rewards/coherence_reward_func/mean": 0.9699999988079071, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.3445102512836455, - "rewards/quality_reward_func/std": 0.07386083230376243, - "step": 3420 - }, - { - "completion_length": 34.0, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.0, - "completions/max_terminated_length": 34.0, - "completions/mean_length": 32.725, - "completions/mean_terminated_length": 32.725, - "completions/min_length": 31.8, - "completions/min_terminated_length": 31.8, - "epoch": 0.02079014680389376, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.24414551258087158, - "kl": 0.7642092987895012, - "learning_rate": 1.358807785120647e-06, - "loss": 0.0001, - "num_tokens": 4436575.0, - "reward": 3.7026368618011474, - "reward_std": 0.0684517988935113, - "rewards/coherence_reward_func/mean": 1.0, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.2026368647813797, - "rewards/quality_reward_func/std": 0.06845179051160813, - "step": 3430 - }, - { - "completion_length": 34.5, - "completions/clipped_ratio": 0.0, - "completions/max_length": 34.5, - "completions/max_terminated_length": 34.5, - "completions/mean_length": 33.625, - "completions/mean_terminated_length": 33.625, - "completions/min_length": 32.8, - "completions/min_terminated_length": 32.8, - "epoch": 0.02085075947679141, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.40003758668899536, - "kl": 0.7258573945611715, - "learning_rate": 1.343306911360833e-06, - "loss": 0.0, - "num_tokens": 4447768.0, - "reward": 3.737292194366455, - "reward_std": 0.12735685943625868, - "rewards/coherence_reward_func/mean": 0.9925000011920929, - "rewards/coherence_reward_func/std": 0.015000002086162567, - "rewards/formatting_reward_func/mean": 1.5, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.244792139530182, - "rewards/quality_reward_func/std": 0.11480730390176178, - "step": 3440 - }, - { - "completion_length": 36.2, - "completions/clipped_ratio": 0.0, - "completions/max_length": 36.2, - "completions/max_terminated_length": 36.2, - "completions/mean_length": 33.15, - "completions/mean_terminated_length": 33.15, - "completions/min_length": 31.6, - "completions/min_terminated_length": 31.6, - "epoch": 0.020911372149689057, - "frac_reward_zero_std": 0.3, - "grad_norm": 0.867883026599884, - "kl": 1.2108185835182668, - "learning_rate": 1.3278624131903088e-06, - "loss": 0.0001, - "num_tokens": 4458758.0, - "reward": 3.524269163608551, - "reward_std": 0.07325016092509032, - "rewards/coherence_reward_func/mean": 0.9, - "rewards/coherence_reward_func/std": 0.0, - "rewards/formatting_reward_func/mean": 1.45, - "rewards/formatting_reward_func/std": 0.0, - "rewards/quality_reward_func/mean": 1.1742691427469254, - "rewards/quality_reward_func/std": 0.0732501860242337, - "step": 3450 } ], "logging_steps": 10, - "max_steps": 5000, - "num_input_tokens_seen": 4458758, + "max_steps": 13092, + "num_input_tokens_seen": 67990, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": {