diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,9 +2,9 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.013751375137513752, + "epoch": 0.08250825082508251, "eval_steps": 500, - "global_step": 200, + "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -528,11 +528,2611 @@ "rewards/quality_reward_func/mean": 0.800000011920929, "rewards/quality_reward_func/std": 0.0, "step": 200 + }, + { + "completion_length": 24.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.8, + "completions/max_terminated_length": 24.8, + "completions/mean_length": 21.266666666666666, + "completions/mean_terminated_length": 21.266666666666666, + "completions/min_length": 17.666666666666668, + "completions/min_terminated_length": 17.666666666666668, + "epoch": 0.014438943894389438, + "frac_reward_zero_std": 0.9333333333333333, + "grad_norm": 0.0, + "kl": 0.8949815820577594, + "learning_rate": 4.9764731440089494e-06, + "loss": 0.0, + "num_tokens": 290559.0, + "reward": 4.001666577657064, + "reward_std": 0.030000003178914388, + "rewards/coherence_reward_func/mean": 1.2599999586741129, + "rewards/coherence_reward_func/std": 0.013333333532015483, + "rewards/formatting_reward_func/mean": 1.975, + "rewards/formatting_reward_func/std": 0.016666666666666666, + "rewards/quality_reward_func/mean": 0.7666666785875956, + "rewards/quality_reward_func/std": 0.0, + "step": 210 + }, + { + "completion_length": 20.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.9, + "completions/max_terminated_length": 20.9, + "completions/mean_length": 18.7, + "completions/mean_terminated_length": 18.7, + "completions/min_length": 17.1, + "completions/min_terminated_length": 17.1, + "epoch": 0.015126512651265127, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.8888865269720554, + "learning_rate": 4.96784066268247e-06, + "loss": 0.0, + "num_tokens": 305655.0, + "reward": 4.087499904632568, + "reward_std": 0.025, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.787500011920929, + "rewards/quality_reward_func/std": 0.025, + "step": 220 + }, + { + "completion_length": 22.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.8, + "completions/max_terminated_length": 22.8, + "completions/mean_length": 18.1, + "completions/mean_terminated_length": 18.1, + "completions/min_length": 15.7, + "completions/min_terminated_length": 15.7, + "epoch": 0.015814081408140813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6253720429260283, + "learning_rate": 4.957871802101782e-06, + "loss": 0.0, + "num_tokens": 319763.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 230 + }, + { + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.725, + "completions/mean_terminated_length": 17.725, + "completions/min_length": 14.6, + "completions/min_terminated_length": 14.6, + "epoch": 0.0165016501650165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.773795984685421, + "learning_rate": 4.9465719605807505e-06, + "loss": 0.0, + "num_tokens": 332104.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 240 + }, + { + "completion_length": 22.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.4, + "completions/max_terminated_length": 22.4, + "completions/mean_length": 18.65, + "completions/mean_terminated_length": 18.65, + "completions/min_length": 15.3, + "completions/min_terminated_length": 15.3, + "epoch": 0.01718921892189219, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7744052636437118, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0, + "num_tokens": 347002.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 250 + }, + { + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 17.225, + "completions/mean_terminated_length": 17.225, + "completions/min_length": 14.6, + "completions/min_terminated_length": 14.6, + "epoch": 0.017876787678767877, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.8353285151533782, + "learning_rate": 4.920004528407837e-06, + "loss": 0.0, + "num_tokens": 360267.0, + "reward": 4.087499904632568, + "reward_std": 0.025, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.787500011920929, + "rewards/quality_reward_func/std": 0.025, + "step": 260 + }, + { + "completion_length": 28.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.2, + "completions/max_terminated_length": 28.2, + "completions/mean_length": 23.05, + "completions/mean_terminated_length": 23.05, + "completions/min_length": 16.9, + "completions/min_terminated_length": 16.9, + "epoch": 0.018564356435643563, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.0, + "kl": 0.7147501368832309, + "learning_rate": 4.904751324489156e-06, + "loss": 0.0, + "num_tokens": 374665.0, + "reward": 4.0024998664855955, + "reward_std": 0.07500000447034835, + "rewards/coherence_reward_func/mean": 1.2399999499320984, + "rewards/coherence_reward_func/std": 0.04999999403953552, + "rewards/formatting_reward_func/mean": 1.9625, + "rewards/formatting_reward_func/std": 0.025, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 270 + }, + { + "completion_length": 22.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.4, + "completions/max_terminated_length": 22.4, + "completions/mean_length": 19.775, + "completions/mean_terminated_length": 19.775, + "completions/min_length": 16.9, + "completions/min_terminated_length": 16.9, + "epoch": 0.019251925192519254, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.09396308660507202, + "kl": 0.710376477369573, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0, + "num_tokens": 389896.0, + "reward": 4.087499904632568, + "reward_std": 0.025, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.787500011920929, + "rewards/quality_reward_func/std": 0.025, + "step": 280 + }, + { + "completion_length": 22.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.3, + "completions/max_terminated_length": 22.3, + "completions/mean_length": 17.925, + "completions/mean_terminated_length": 17.925, + "completions/min_length": 15.1, + "completions/min_terminated_length": 15.1, + "epoch": 0.01993949394939494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.5978727843612432, + "learning_rate": 4.870347235909494e-06, + "loss": 0.0, + "num_tokens": 403949.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 290 + }, + { + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.5, + "completions/max_terminated_length": 30.5, + "completions/mean_length": 22.425, + "completions/mean_terminated_length": 22.425, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.020627062706270627, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.7130264882929623, + "learning_rate": 4.851214981669406e-06, + "loss": 0.0, + "num_tokens": 418738.0, + "reward": 4.092499876022339, + "reward_std": 0.015000002086162567, + "rewards/coherence_reward_func/mean": 1.2924999475479126, + "rewards/coherence_reward_func/std": 0.01499999761581421, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 300 + }, + { + "completion_length": 24.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.3, + "completions/max_terminated_length": 24.3, + "completions/mean_length": 20.675, + "completions/mean_terminated_length": 20.675, + "completions/min_length": 18.2, + "completions/min_terminated_length": 18.2, + "epoch": 0.021314631463146314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6356086356565356, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0, + "num_tokens": 434841.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 310 + }, + { + "completion_length": 24.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.2, + "completions/max_terminated_length": 24.2, + "completions/mean_length": 19.875, + "completions/mean_terminated_length": 19.875, + "completions/min_length": 15.3, + "completions/min_terminated_length": 15.3, + "epoch": 0.022002200220022004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8585543476045132, + "learning_rate": 4.809141849944271e-06, + "loss": 0.0, + "num_tokens": 450032.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 320 + }, + { + "completion_length": 24.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.1, + "completions/max_terminated_length": 24.1, + "completions/mean_length": 20.35, + "completions/mean_terminated_length": 20.35, + "completions/min_length": 17.2, + "completions/min_terminated_length": 17.2, + "epoch": 0.02268976897689769, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7699921066872776, + "learning_rate": 4.786223755802268e-06, + "loss": 0.0, + "num_tokens": 464362.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 330 + }, + { + "completion_length": 18.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.4, + "completions/max_terminated_length": 18.4, + "completions/mean_length": 16.55, + "completions/mean_terminated_length": 16.55, + "completions/min_length": 14.8, + "completions/min_terminated_length": 14.8, + "epoch": 0.023377337733773377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7013043455779553, + "learning_rate": 4.762067631165049e-06, + "loss": 0.0, + "num_tokens": 478792.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 340 + }, + { + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 16.075, + "completions/mean_terminated_length": 16.075, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.024064906490649064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7579462625086307, + "learning_rate": 4.736686557000247e-06, + "loss": 0.0, + "num_tokens": 492155.0, + "reward": 3.6899999141693116, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.1699999570846558, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.8, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.720000010728836, + "rewards/quality_reward_func/std": 0.0, + "step": 350 + }, + { + "completion_length": 36.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.4, + "completions/max_terminated_length": 36.4, + "completions/mean_length": 27.95, + "completions/mean_terminated_length": 27.95, + "completions/min_length": 19.3, + "completions/min_terminated_length": 19.3, + "epoch": 0.024752475247524754, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.523247461207211, + "learning_rate": 4.710094277607269e-06, + "loss": 0.0, + "num_tokens": 508017.0, + "reward": 4.077499914169311, + "reward_std": 0.04499998986721039, + "rewards/coherence_reward_func/mean": 1.2899999499320984, + "rewards/coherence_reward_func/std": 0.020000000298023225, + "rewards/formatting_reward_func/mean": 1.9875, + "rewards/formatting_reward_func/std": 0.025, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 360 + }, + { + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 17.825, + "completions/mean_terminated_length": 17.825, + "completions/min_length": 15.2, + "completions/min_terminated_length": 15.2, + "epoch": 0.02544004400440044, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.8155595321208239, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0, + "num_tokens": 521294.0, + "reward": 4.087499904632568, + "reward_std": 0.025, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.787500011920929, + "rewards/quality_reward_func/std": 0.025, + "step": 370 + }, + { + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.5, + "completions/max_terminated_length": 23.5, + "completions/mean_length": 20.825, + "completions/mean_terminated_length": 20.825, + "completions/min_length": 18.1, + "completions/min_terminated_length": 18.1, + "epoch": 0.026127612761276127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7052144195884467, + "learning_rate": 4.653334351981464e-06, + "loss": 0.0, + "num_tokens": 536163.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 380 + }, + { + "completion_length": 23.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.2, + "completions/max_terminated_length": 23.2, + "completions/mean_length": 19.45, + "completions/mean_terminated_length": 19.45, + "completions/min_length": 16.3, + "completions/min_terminated_length": 16.3, + "epoch": 0.026815181518151814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.798173398245126, + "learning_rate": 4.623197442249667e-06, + "loss": 0.0, + "num_tokens": 552493.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 390 + }, + { + "completion_length": 26.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 21.675, + "completions/mean_terminated_length": 21.675, + "completions/min_length": 18.2, + "completions/min_terminated_length": 18.2, + "epoch": 0.027502750275027504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6294201251119376, + "learning_rate": 4.591910783647405e-06, + "loss": 0.0, + "num_tokens": 567780.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 400 + }, + { + "completion_length": 20.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.6, + "completions/max_terminated_length": 20.6, + "completions/mean_length": 18.425, + "completions/mean_terminated_length": 18.425, + "completions/min_length": 16.7, + "completions/min_terminated_length": 16.7, + "epoch": 0.02819031903190319, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.8350168326869607, + "learning_rate": 4.559491318452238e-06, + "loss": 0.0, + "num_tokens": 579861.0, + "reward": 4.06999990940094, + "reward_std": 0.059999996423721315, + "rewards/coherence_reward_func/mean": 1.2824999570846558, + "rewards/coherence_reward_func/std": 0.034999996423721313, + "rewards/formatting_reward_func/mean": 1.9875, + "rewards/formatting_reward_func/std": 0.025, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 410 + }, + { + "completion_length": 25.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.1, + "completions/max_terminated_length": 25.1, + "completions/mean_length": 20.95, + "completions/mean_terminated_length": 20.95, + "completions/min_length": 17.4, + "completions/min_terminated_length": 17.4, + "epoch": 0.028877887788778877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6226312205195427, + "learning_rate": 4.525956602376486e-06, + "loss": 0.0, + "num_tokens": 593839.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 420 + }, + { + "completion_length": 21.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.7, + "completions/max_terminated_length": 21.7, + "completions/mean_length": 19.275, + "completions/mean_terminated_length": 19.275, + "completions/min_length": 17.3, + "completions/min_terminated_length": 17.3, + "epoch": 0.029565456545654567, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.6491609741002321, + "learning_rate": 4.491324795060491e-06, + "loss": 0.0, + "num_tokens": 609682.0, + "reward": 4.077499914169311, + "reward_std": 0.01499999314546585, + "rewards/coherence_reward_func/mean": 1.2774999618530274, + "rewards/coherence_reward_func/std": 0.01499999761581421, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 430 + }, + { + "completion_length": 20.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.3, + "completions/max_terminated_length": 20.3, + "completions/mean_length": 17.6, + "completions/mean_terminated_length": 17.6, + "completions/min_length": 14.6, + "completions/min_terminated_length": 14.6, + "epoch": 0.030253025302530254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.5745274079963565, + "learning_rate": 4.455614650238858e-06, + "loss": 0.0, + "num_tokens": 623798.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 440 + }, + { + "completion_length": 24.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.1, + "completions/max_terminated_length": 24.1, + "completions/mean_length": 19.7, + "completions/mean_terminated_length": 19.7, + "completions/min_length": 16.6, + "completions/min_terminated_length": 16.6, + "epoch": 0.03094059405940594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00047155821812339127, + "kl": 0.6368549924343825, + "learning_rate": 4.418845505584972e-06, + "loss": 0.0, + "num_tokens": 637042.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 450 + }, + { + "completion_length": 22.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.9, + "completions/max_terminated_length": 22.9, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 15.2, + "completions/min_terminated_length": 15.2, + "epoch": 0.03162816281628163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7010985561646521, + "learning_rate": 4.381037272239311e-06, + "loss": 0.0, + "num_tokens": 652746.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 460 + }, + { + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.5, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 18.65, + "completions/mean_terminated_length": 18.65, + "completions/min_length": 15.2, + "completions/min_terminated_length": 15.2, + "epoch": 0.032315731573157314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8845027022063732, + "learning_rate": 4.34221042402721e-06, + "loss": 0.0, + "num_tokens": 666280.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 470 + }, + { + "completion_length": 21.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.8, + "completions/max_terminated_length": 21.8, + "completions/mean_length": 18.95, + "completions/mean_terminated_length": 18.95, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "epoch": 0.033003300330033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 1.0147626213729382, + "learning_rate": 4.302385986371924e-06, + "loss": 0.0, + "num_tokens": 680162.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 480 + }, + { + "completion_length": 19.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.3, + "completions/max_terminated_length": 19.3, + "completions/mean_length": 16.675, + "completions/mean_terminated_length": 16.675, + "completions/min_length": 14.5, + "completions/min_terminated_length": 14.5, + "epoch": 0.033690869086908694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 1.1609789356589317, + "learning_rate": 4.261585524908987e-06, + "loss": 0.0, + "num_tokens": 694665.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 490 + }, + { + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.5, + "completions/max_terminated_length": 23.5, + "completions/mean_length": 20.45, + "completions/mean_terminated_length": 20.45, + "completions/min_length": 17.9, + "completions/min_terminated_length": 17.9, + "epoch": 0.03437843784378438, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.47180836610496046, + "learning_rate": 4.2198311338080466e-06, + "loss": 0.0, + "num_tokens": 711203.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 500 + }, + { + "completion_length": 22.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.1, + "completions/max_terminated_length": 22.1, + "completions/mean_length": 16.975, + "completions/mean_terminated_length": 16.975, + "completions/min_length": 13.9, + "completions/min_terminated_length": 13.9, + "epoch": 0.03506600660066007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7761064074933529, + "learning_rate": 4.177145423808477e-06, + "loss": 0.0, + "num_tokens": 723994.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 510 + }, + { + "completion_length": 27.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.9, + "completions/max_terminated_length": 27.9, + "completions/mean_length": 21.05, + "completions/mean_terminated_length": 21.05, + "completions/min_length": 16.3, + "completions/min_terminated_length": 16.3, + "epoch": 0.035753575357535754, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.9148745775222779, + "learning_rate": 4.133551509975264e-06, + "loss": 0.0, + "num_tokens": 738352.0, + "reward": 4.092499876022339, + "reward_std": 0.015000002086162567, + "rewards/coherence_reward_func/mean": 1.2924999475479126, + "rewards/coherence_reward_func/std": 0.01499999761581421, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 520 + }, + { + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 18.05, + "completions/mean_terminated_length": 18.05, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.03644114411441144, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8287635251879693, + "learning_rate": 4.089072999181792e-06, + "loss": 0.0, + "num_tokens": 755102.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 530 + }, + { + "completion_length": 20.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.6, + "completions/max_terminated_length": 20.6, + "completions/mean_length": 16.975, + "completions/mean_terminated_length": 16.975, + "completions/min_length": 14.5, + "completions/min_terminated_length": 14.5, + "epoch": 0.03712871287128713, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.6910347867757082, + "learning_rate": 4.043733977326304e-06, + "loss": 0.0, + "num_tokens": 768025.0, + "reward": 3.8949999094009398, + "reward_std": 0.23671360015869142, + "rewards/coherence_reward_func/mean": 1.23499995470047, + "rewards/coherence_reward_func/std": 0.07505553364753723, + "rewards/formatting_reward_func/mean": 1.9, + "rewards/formatting_reward_func/std": 0.1154700517654419, + "rewards/quality_reward_func/mean": 0.7600000113248825, + "rewards/quality_reward_func/std": 0.046188023686408994, + "step": 540 + }, + { + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.85, + "completions/mean_terminated_length": 19.85, + "completions/min_length": 16.4, + "completions/min_terminated_length": 16.4, + "epoch": 0.037816281628162814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6972396697849035, + "learning_rate": 3.997558996288965e-06, + "loss": 0.0, + "num_tokens": 782131.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 550 + }, + { + "completion_length": 23.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.7, + "completions/max_terminated_length": 23.7, + "completions/mean_length": 19.5, + "completions/mean_terminated_length": 19.5, + "completions/min_length": 16.3, + "completions/min_terminated_length": 16.3, + "epoch": 0.03850385038503851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6881673350930214, + "learning_rate": 3.9505730606365826e-06, + "loss": 0.0, + "num_tokens": 797371.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 560 + }, + { + "completion_length": 22.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.1, + "completions/max_terminated_length": 22.1, + "completions/mean_length": 18.05, + "completions/mean_terminated_length": 18.05, + "completions/min_length": 14.8, + "completions/min_terminated_length": 14.8, + "epoch": 0.039191419141914194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6732351541519165, + "learning_rate": 3.902801614082195e-06, + "loss": 0.0, + "num_tokens": 812681.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 570 + }, + { + "completion_length": 19.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.4, + "completions/max_terminated_length": 19.4, + "completions/mean_length": 16.35, + "completions/mean_terminated_length": 16.35, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.03987898789878988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8790098778903485, + "learning_rate": 3.85427052570685e-06, + "loss": 0.0, + "num_tokens": 828143.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 580 + }, + { + "completion_length": 25.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.5, + "completions/max_terminated_length": 25.5, + "completions/mean_length": 20.025, + "completions/mean_terminated_length": 20.025, + "completions/min_length": 15.7, + "completions/min_terminated_length": 15.7, + "epoch": 0.04056655665566557, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.6692779961973429, + "learning_rate": 3.8050060759510453e-06, + "loss": 0.0, + "num_tokens": 843544.0, + "reward": 4.092499876022339, + "reward_std": 0.015000002086162567, + "rewards/coherence_reward_func/mean": 1.2924999475479126, + "rewards/coherence_reward_func/std": 0.01499999761581421, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 590 + }, + { + "completion_length": 22.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.9, + "completions/max_terminated_length": 22.9, + "completions/mean_length": 20.725, + "completions/mean_terminated_length": 20.725, + "completions/min_length": 18.4, + "completions/min_terminated_length": 18.4, + "epoch": 0.041254125412541254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.842956809559837, + "learning_rate": 3.755034942383401e-06, + "loss": 0.0, + "num_tokens": 858861.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 600 + }, + { + "completion_length": 21.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.1, + "completions/max_terminated_length": 21.1, + "completions/mean_length": 17.675, + "completions/mean_terminated_length": 17.675, + "completions/min_length": 15.6, + "completions/min_terminated_length": 15.6, + "epoch": 0.04194169416941694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.741293515264988, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.0, + "num_tokens": 872064.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 610 + }, + { + "completion_length": 21.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.9, + "completions/max_terminated_length": 21.9, + "completions/mean_length": 19.125, + "completions/mean_terminated_length": 19.125, + "completions/min_length": 16.2, + "completions/min_terminated_length": 16.2, + "epoch": 0.04262926292629263, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8384895706549287, + "learning_rate": 3.6530812328422272e-06, + "loss": 0.0, + "num_tokens": 886225.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 620 + }, + { + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.5, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 19.625, + "completions/mean_terminated_length": 19.625, + "completions/min_length": 17.8, + "completions/min_terminated_length": 17.8, + "epoch": 0.043316831683168314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7357438832521439, + "learning_rate": 3.6011538666009877e-06, + "loss": 0.0, + "num_tokens": 902306.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 630 + }, + { + "completion_length": 22.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.9, + "completions/max_terminated_length": 22.9, + "completions/mean_length": 19.075, + "completions/mean_terminated_length": 19.075, + "completions/min_length": 15.9, + "completions/min_terminated_length": 15.9, + "epoch": 0.04400440044004401, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8144454814493656, + "learning_rate": 3.5486302061154433e-06, + "loss": 0.0, + "num_tokens": 915053.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 640 + }, + { + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.525, + "completions/mean_terminated_length": 15.525, + "completions/min_length": 13.4, + "completions/min_terminated_length": 13.4, + "epoch": 0.044691969196919694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.821483525633812, + "learning_rate": 3.4955386938743217e-06, + "loss": 0.0, + "num_tokens": 928142.0, + "reward": 4.0499999046325685, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.7500000119209289, + "rewards/quality_reward_func/std": 0.0, + "step": 650 + }, + { + "completion_length": 21.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.1, + "completions/max_terminated_length": 21.1, + "completions/mean_length": 17.85, + "completions/mean_terminated_length": 17.85, + "completions/min_length": 14.2, + "completions/min_terminated_length": 14.2, + "epoch": 0.04537953795379538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8167589947581291, + "learning_rate": 3.4419080798680934e-06, + "loss": 0.0, + "num_tokens": 943068.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 660 + }, + { + "completion_length": 24.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.825, + "completions/mean_terminated_length": 21.825, + "completions/min_length": 18.7, + "completions/min_terminated_length": 18.7, + "epoch": 0.04606710671067107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7352058874210343, + "learning_rate": 3.387767406020343e-06, + "loss": 0.0, + "num_tokens": 957793.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 670 + }, + { + "completion_length": 20.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.3, + "completions/max_terminated_length": 20.3, + "completions/mean_length": 18.025, + "completions/mean_terminated_length": 18.025, + "completions/min_length": 16.3, + "completions/min_terminated_length": 16.3, + "epoch": 0.046754675467546754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.9711360923945904, + "learning_rate": 3.333145990461061e-06, + "loss": 0.0, + "num_tokens": 973398.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 680 + }, + { + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 19.35, + "completions/mean_terminated_length": 19.35, + "completions/min_length": 16.2, + "completions/min_terminated_length": 16.2, + "epoch": 0.04744224422442244, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6931308001279831, + "learning_rate": 3.2780734116503504e-06, + "loss": 0.0, + "num_tokens": 986624.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 690 + }, + { + "completion_length": 22.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.2, + "completions/max_terminated_length": 22.2, + "completions/mean_length": 17.6, + "completions/mean_terminated_length": 17.6, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.04812981298129813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.913802171498537, + "learning_rate": 3.222579492361179e-06, + "loss": 0.0, + "num_tokens": 999188.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 700 + }, + { + "completion_length": 15.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.9, + "completions/max_terminated_length": 15.9, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 13.4, + "completions/min_terminated_length": 13.4, + "epoch": 0.04881738173817382, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.417941717198119e-05, + "kl": 0.9791606456041336, + "learning_rate": 3.1666942835298143e-06, + "loss": 0.0, + "num_tokens": 1013474.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 710 + }, + { + "completion_length": 26.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.8, + "completions/max_terminated_length": 26.8, + "completions/mean_length": 21.575, + "completions/mean_terminated_length": 21.575, + "completions/min_length": 17.1, + "completions/min_terminated_length": 17.1, + "epoch": 0.04950495049504951, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.561687457934022, + "learning_rate": 3.110448047982714e-06, + "loss": 0.0, + "num_tokens": 1027477.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 720 + }, + { + "completion_length": 25.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.9, + "completions/max_terminated_length": 25.9, + "completions/mean_length": 21.275, + "completions/mean_terminated_length": 21.275, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.050192519251925194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7257001161575317, + "learning_rate": 3.053871244048669e-06, + "loss": 0.0, + "num_tokens": 1041216.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 730 + }, + { + "completion_length": 23.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.3, + "completions/max_terminated_length": 23.3, + "completions/mean_length": 19.725, + "completions/mean_terminated_length": 19.725, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.05088008800880088, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.9793748699128628, + "learning_rate": 2.9969945090650866e-06, + "loss": 0.0, + "num_tokens": 1055529.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 740 + }, + { + "completion_length": 23.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.2, + "completions/max_terminated_length": 23.2, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 15.7, + "completions/min_terminated_length": 15.7, + "epoch": 0.05156765676567657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6666584078222513, + "learning_rate": 2.9398486427873276e-06, + "loss": 0.0, + "num_tokens": 1070861.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 750 + }, + { + "completion_length": 18.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.7, + "completions/max_terminated_length": 18.7, + "completions/mean_length": 16.675, + "completions/mean_terminated_length": 16.675, + "completions/min_length": 14.4, + "completions/min_terminated_length": 14.4, + "epoch": 0.052255225522552254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 1.0366931222379208, + "learning_rate": 2.8824645907100957e-06, + "loss": 0.0, + "num_tokens": 1087072.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 760 + }, + { + "completion_length": 26.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.8, + "completions/max_terminated_length": 26.8, + "completions/mean_length": 19.2, + "completions/mean_terminated_length": 19.2, + "completions/min_length": 14.8, + "completions/min_terminated_length": 14.8, + "epoch": 0.05294279427942794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8708533995551988, + "learning_rate": 2.824873427309907e-06, + "loss": 0.0, + "num_tokens": 1102088.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 770 + }, + { + "completion_length": 23.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.1, + "completions/max_terminated_length": 23.1, + "completions/mean_length": 18.425, + "completions/mean_terminated_length": 18.425, + "completions/min_length": 14.4, + "completions/min_terminated_length": 14.4, + "epoch": 0.05363036303630363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7705225108191371, + "learning_rate": 2.7671063392177133e-06, + "loss": 0.0, + "num_tokens": 1116397.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 780 + }, + { + "completion_length": 24.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.8, + "completions/max_terminated_length": 24.8, + "completions/mean_length": 20.55, + "completions/mean_terminated_length": 20.55, + "completions/min_length": 17.9, + "completions/min_terminated_length": 17.9, + "epoch": 0.05431793179317932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6953186644241214, + "learning_rate": 2.70919460833079e-06, + "loss": 0.0, + "num_tokens": 1134011.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 790 + }, + { + "completion_length": 24.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 20.75, + "completions/mean_terminated_length": 20.75, + "completions/min_length": 17.9, + "completions/min_terminated_length": 17.9, + "epoch": 0.05500550055005501, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.5938798669725657, + "learning_rate": 2.6511695948730357e-06, + "loss": 0.0, + "num_tokens": 1149273.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 800 + }, + { + "completion_length": 20.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.3, + "completions/max_terminated_length": 20.3, + "completions/mean_length": 18.075, + "completions/mean_terminated_length": 18.075, + "completions/min_length": 15.6, + "completions/min_terminated_length": 15.6, + "epoch": 0.055693069306930694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 1.0365911795757712, + "learning_rate": 2.593062720412865e-06, + "loss": 0.0, + "num_tokens": 1162224.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 810 + }, + { + "completion_length": 24.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.2, + "completions/max_terminated_length": 24.2, + "completions/mean_length": 20.7, + "completions/mean_terminated_length": 20.7, + "completions/min_length": 17.7, + "completions/min_terminated_length": 17.7, + "epoch": 0.05638063806380638, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6314735591411591, + "learning_rate": 2.5349054508478636e-06, + "loss": 0.0, + "num_tokens": 1178732.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 820 + }, + { + "completion_length": 16.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.8, + "completions/max_terminated_length": 16.8, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 13.7, + "completions/min_terminated_length": 13.7, + "epoch": 0.05706820682068207, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8896382354199887, + "learning_rate": 2.4767292793654587e-06, + "loss": 0.0, + "num_tokens": 1190762.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 830 + }, + { + "completion_length": 20.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.6, + "completions/max_terminated_length": 20.6, + "completions/mean_length": 18.125, + "completions/mean_terminated_length": 18.125, + "completions/min_length": 15.7, + "completions/min_terminated_length": 15.7, + "epoch": 0.057755775577557754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 1.0564545184373855, + "learning_rate": 2.4185657093887975e-06, + "loss": 0.0, + "num_tokens": 1205791.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 840 + }, + { + "completion_length": 22.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.8, + "completions/max_terminated_length": 22.8, + "completions/mean_length": 19.375, + "completions/mean_terminated_length": 19.375, + "completions/min_length": 15.4, + "completions/min_terminated_length": 15.4, + "epoch": 0.05844334433443344, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 1.0678101744502784, + "learning_rate": 2.3604462375170905e-06, + "loss": 0.0, + "num_tokens": 1220030.0, + "reward": 4.024999904632568, + "reward_std": 0.15, + "rewards/coherence_reward_func/mean": 1.2824999570846558, + "rewards/coherence_reward_func/std": 0.034999996423721313, + "rewards/formatting_reward_func/mean": 1.9625, + "rewards/formatting_reward_func/std": 0.075, + "rewards/quality_reward_func/mean": 0.7800000131130218, + "rewards/quality_reward_func/std": 0.04000000059604645, + "step": 850 + }, + { + "completion_length": 21.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.9, + "completions/max_terminated_length": 21.9, + "completions/mean_length": 18.625, + "completions/mean_terminated_length": 18.625, + "completions/min_length": 15.9, + "completions/min_terminated_length": 15.9, + "epoch": 0.059130913091309134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8187783338828012, + "learning_rate": 2.3024023364696473e-06, + "loss": 0.0, + "num_tokens": 1234427.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 860 + }, + { + "completion_length": 22.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.3, + "completions/max_terminated_length": 22.3, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 15.4, + "completions/min_terminated_length": 15.4, + "epoch": 0.05981848184818482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.962415424734354, + "learning_rate": 2.2444654380428413e-06, + "loss": 0.0, + "num_tokens": 1246113.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 870 + }, + { + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 19.1, + "completions/mean_terminated_length": 19.1, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "epoch": 0.06050605060506051, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.7348126769065857, + "kl": 3.9835946768522263, + "learning_rate": 2.186666916089239e-06, + "loss": 0.0002, + "num_tokens": 1262277.0, + "reward": 4.087499904632568, + "reward_std": 0.025, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.787500011920929, + "rewards/quality_reward_func/std": 0.025, + "step": 880 + }, + { + "completion_length": 23.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.6, + "completions/max_terminated_length": 23.6, + "completions/mean_length": 17.7, + "completions/mean_terminated_length": 17.7, + "completions/min_length": 14.3, + "completions/min_terminated_length": 14.3, + "epoch": 0.061193619361936194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8025905980728567, + "learning_rate": 2.1290380695281083e-06, + "loss": 0.0, + "num_tokens": 1276361.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 890 + }, + { + "completion_length": 21.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.6, + "completions/max_terminated_length": 21.6, + "completions/mean_length": 18.475, + "completions/mean_terminated_length": 18.475, + "completions/min_length": 15.3, + "completions/min_terminated_length": 15.3, + "epoch": 0.06188118811881188, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.5589495476335287, + "learning_rate": 2.0716101053964965e-06, + "loss": 0.0, + "num_tokens": 1290076.0, + "reward": 4.077499914169311, + "reward_std": 0.01499999314546585, + "rewards/coherence_reward_func/mean": 1.2774999618530274, + "rewards/coherence_reward_func/std": 0.01499999761581421, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 900 + }, + { + "completion_length": 19.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.4, + "completions/max_terminated_length": 19.4, + "completions/mean_length": 17.225, + "completions/mean_terminated_length": 17.225, + "completions/min_length": 15.2, + "completions/min_terminated_length": 15.2, + "epoch": 0.06256875687568757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 1.0769402489066124, + "learning_rate": 2.0144141219500707e-06, + "loss": 0.0, + "num_tokens": 1304321.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 910 + }, + { + "completion_length": 24.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.2, + "completions/max_terminated_length": 24.2, + "completions/mean_length": 20.775, + "completions/mean_terminated_length": 20.775, + "completions/min_length": 17.4, + "completions/min_terminated_length": 17.4, + "epoch": 0.06325632563256325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7756512950756587, + "learning_rate": 1.9574810918228667e-06, + "loss": 0.0, + "num_tokens": 1318764.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 920 + }, + { + "completion_length": 28.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.2, + "completions/max_terminated_length": 28.2, + "completions/mean_length": 24.375, + "completions/mean_terminated_length": 24.375, + "completions/min_length": 21.1, + "completions/min_terminated_length": 21.1, + "epoch": 0.06394389438943894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.987953900906723, + "learning_rate": 1.9008418452550579e-06, + "loss": 0.0, + "num_tokens": 1332931.0, + "reward": 3.9799998998641968, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2299999594688416, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.95, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 930 + }, + { + "completion_length": 21.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.2, + "completions/max_terminated_length": 21.2, + "completions/mean_length": 18.475, + "completions/mean_terminated_length": 18.475, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.06463146314631463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7319877591449767, + "learning_rate": 1.8445270533978387e-06, + "loss": 0.0, + "num_tokens": 1344754.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 940 + }, + { + "completion_length": 22.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.2, + "completions/max_terminated_length": 22.2, + "completions/mean_length": 18.4, + "completions/mean_terminated_length": 18.4, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.06531903190319031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8628140719607472, + "learning_rate": 1.788567211704453e-06, + "loss": 0.0, + "num_tokens": 1359610.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 950 + }, + { + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 13.8, + "completions/min_terminated_length": 13.8, + "epoch": 0.066006600660066, + "frac_reward_zero_std": 0.9, + "grad_norm": 2.449816201988142e-05, + "kl": 0.7857629887759685, + "learning_rate": 1.7329926234163694e-06, + "loss": 0.0, + "num_tokens": 1373758.0, + "reward": 4.087499904632568, + "reward_std": 0.025, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.787500011920929, + "rewards/quality_reward_func/std": 0.025, + "step": 960 + }, + { + "completion_length": 22.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.3, + "completions/max_terminated_length": 22.3, + "completions/mean_length": 19.05, + "completions/mean_terminated_length": 19.05, + "completions/min_length": 15.1, + "completions/min_terminated_length": 15.1, + "epoch": 0.0666941694169417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.736820934060961, + "learning_rate": 1.677833383153542e-06, + "loss": 0.0, + "num_tokens": 1387248.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 970 + }, + { + "completion_length": 22.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.6, + "completions/max_terminated_length": 22.6, + "completions/mean_length": 18.175, + "completions/mean_terminated_length": 18.175, + "completions/min_length": 14.6, + "completions/min_terminated_length": 14.6, + "epoch": 0.06738173817381739, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.5687850050628185, + "learning_rate": 1.6231193606176415e-06, + "loss": 0.0, + "num_tokens": 1402547.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 980 + }, + { + "completion_length": 21.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.2, + "completions/max_terminated_length": 21.2, + "completions/mean_length": 18.025, + "completions/mean_terminated_length": 18.025, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "epoch": 0.06806930693069307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.82333648793865, + "learning_rate": 1.5688801844170846e-06, + "loss": 0.0, + "num_tokens": 1416408.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 990 + }, + { + "completion_length": 23.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.7, + "completions/max_terminated_length": 23.7, + "completions/mean_length": 19.85, + "completions/mean_terminated_length": 19.85, + "completions/min_length": 17.1, + "completions/min_terminated_length": 17.1, + "epoch": 0.06875687568756876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.819248874951154, + "learning_rate": 1.5151452260226224e-06, + "loss": 0.0, + "num_tokens": 1429986.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1000 + }, + { + "completion_length": 20.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.2, + "completions/max_terminated_length": 20.2, + "completions/mean_length": 17.6, + "completions/mean_terminated_length": 17.6, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.06944444444444445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7394288497045636, + "learning_rate": 1.4619435838621677e-06, + "loss": 0.0, + "num_tokens": 1445078.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1010 + }, + { + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.5, + "completions/max_terminated_length": 23.5, + "completions/mean_length": 20.9, + "completions/mean_terminated_length": 20.9, + "completions/min_length": 17.6, + "completions/min_terminated_length": 17.6, + "epoch": 0.07013201320132013, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.5150950760114938, + "learning_rate": 1.4093040675634834e-06, + "loss": 0.0, + "num_tokens": 1460718.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1020 + }, + { + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 18.525, + "completions/mean_terminated_length": 18.525, + "completions/min_length": 15.9, + "completions/min_terminated_length": 15.9, + "epoch": 0.07081958195819582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8705919556319713, + "learning_rate": 1.3572551823532654e-06, + "loss": 0.0, + "num_tokens": 1474799.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1030 + }, + { + "completion_length": 20.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.4, + "completions/max_terminated_length": 20.4, + "completions/mean_length": 17.7, + "completions/mean_terminated_length": 17.7, + "completions/min_length": 15.7, + "completions/min_terminated_length": 15.7, + "epoch": 0.07150715071507151, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7958902813494205, + "learning_rate": 1.305825113621051e-06, + "loss": 0.0, + "num_tokens": 1487611.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1040 + }, + { + "completion_length": 22.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 19.075, + "completions/mean_terminated_length": 19.075, + "completions/min_length": 15.7, + "completions/min_terminated_length": 15.7, + "epoch": 0.0721947194719472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6268668082542718, + "learning_rate": 1.2550417116563413e-06, + "loss": 0.0, + "num_tokens": 1501134.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1050 + }, + { + "completion_length": 22.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.2, + "completions/max_terminated_length": 22.2, + "completions/mean_length": 18.45, + "completions/mean_terminated_length": 18.45, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.07288228822882288, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7750688172876835, + "learning_rate": 1.204932476567175e-06, + "loss": 0.0, + "num_tokens": 1515356.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1060 + }, + { + "completion_length": 17.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.8, + "completions/max_terminated_length": 17.8, + "completions/mean_length": 15.325, + "completions/mean_terminated_length": 15.325, + "completions/min_length": 13.6, + "completions/min_terminated_length": 13.6, + "epoch": 0.07356985698569857, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.8960678532719613, + "learning_rate": 1.1555245433883322e-06, + "loss": 0.0, + "num_tokens": 1529909.0, + "reward": 4.087499904632568, + "reward_std": 0.025, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.787500011920929, + "rewards/quality_reward_func/std": 0.025, + "step": 1070 + }, + { + "completion_length": 22.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.9, + "completions/max_terminated_length": 22.9, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 14.7, + "completions/min_terminated_length": 14.7, + "epoch": 0.07425742574257425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7110856842249632, + "learning_rate": 1.1068446673872394e-06, + "loss": 0.0, + "num_tokens": 1545037.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1080 + }, + { + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.15, + "completions/mean_terminated_length": 19.15, + "completions/min_length": 16.1, + "completions/min_terminated_length": 16.1, + "epoch": 0.07494499449944994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8430487784557045, + "learning_rate": 1.0589192095755172e-06, + "loss": 0.0, + "num_tokens": 1560495.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1090 + }, + { + "completion_length": 19.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.9, + "completions/max_terminated_length": 19.9, + "completions/mean_length": 17.825, + "completions/mean_terminated_length": 17.825, + "completions/min_length": 15.5, + "completions/min_terminated_length": 15.5, + "epoch": 0.07563256325632563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8941279411315918, + "learning_rate": 1.0117741224340255e-06, + "loss": 0.0, + "num_tokens": 1575040.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1100 + }, + { + "completion_length": 20.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.1, + "completions/max_terminated_length": 20.1, + "completions/mean_length": 17.175, + "completions/mean_terminated_length": 17.175, + "completions/min_length": 14.5, + "completions/min_terminated_length": 14.5, + "epoch": 0.07632013201320131, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8955685901921242, + "learning_rate": 9.654349358591437e-07, + "loss": 0.0, + "num_tokens": 1590247.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1110 + }, + { + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 19.425, + "completions/mean_terminated_length": 19.425, + "completions/min_length": 17.7, + "completions/min_terminated_length": 17.7, + "epoch": 0.07700770077007701, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.836240575928241, + "learning_rate": 9.199267433378728e-07, + "loss": 0.0, + "num_tokens": 1606988.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1120 + }, + { + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 17.625, + "completions/mean_terminated_length": 17.625, + "completions/min_length": 15.3, + "completions/min_terminated_length": 15.3, + "epoch": 0.0776952695269527, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8303040105034597, + "learning_rate": 8.752741883592792e-07, + "loss": 0.0, + "num_tokens": 1622165.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1130 + }, + { + "completion_length": 21.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.7, + "completions/max_terminated_length": 21.7, + "completions/mean_length": 17.95, + "completions/mean_terminated_length": 17.95, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.07838283828382839, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.9129018164705485, + "learning_rate": 8.315014510696004e-07, + "loss": 0.0, + "num_tokens": 1637579.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1140 + }, + { + "completion_length": 18.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.1, + "completions/max_terminated_length": 18.1, + "completions/mean_length": 16.95, + "completions/mean_terminated_length": 16.95, + "completions/min_length": 15.6, + "completions/min_terminated_length": 15.6, + "epoch": 0.07907040704070407, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "kl": 0.8532706722617149, + "learning_rate": 7.886322351782782e-07, + "loss": 0.0, + "num_tokens": 1650761.0, + "reward": 4.074999904632568, + "reward_std": 0.028867512941360474, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.7750000119209289, + "rewards/quality_reward_func/std": 0.028867512941360474, + "step": 1150 + }, + { + "completion_length": 19.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.7, + "completions/max_terminated_length": 19.7, + "completions/mean_length": 16.925, + "completions/mean_terminated_length": 16.925, + "completions/min_length": 14.5, + "completions/min_terminated_length": 14.5, + "epoch": 0.07975797579757976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8230630304664374, + "learning_rate": 7.466897551219779e-07, + "loss": 0.0, + "num_tokens": 1666838.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1160 + }, + { + "completion_length": 22.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.4, + "completions/max_terminated_length": 22.4, + "completions/mean_length": 18.775, + "completions/mean_terminated_length": 18.775, + "completions/min_length": 16.1, + "completions/min_terminated_length": 16.1, + "epoch": 0.08044554455445545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.8656837910413742, + "learning_rate": 7.056967234935583e-07, + "loss": 0.0, + "num_tokens": 1681949.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1170 + }, + { + "completion_length": 24.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.8, + "completions/max_terminated_length": 24.8, + "completions/mean_length": 21.35, + "completions/mean_terminated_length": 21.35, + "completions/min_length": 17.9, + "completions/min_terminated_length": 17.9, + "epoch": 0.08113311331133113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0007500413339585066, + "kl": 0.5578598533757031, + "learning_rate": 6.656753387428089e-07, + "loss": 0.0, + "num_tokens": 1694679.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1180 + }, + { + "completion_length": 23.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.2, + "completions/max_terminated_length": 23.2, + "completions/mean_length": 19.25, + "completions/mean_terminated_length": 19.25, + "completions/min_length": 15.9, + "completions/min_terminated_length": 15.9, + "epoch": 0.08182068206820682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.6424278654158115, + "learning_rate": 6.266472731555928e-07, + "loss": 0.0, + "num_tokens": 1710533.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1190 + }, + { + "completion_length": 22.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.3, + "completions/max_terminated_length": 22.3, + "completions/mean_length": 18.575, + "completions/mean_terminated_length": 18.575, + "completions/min_length": 15.7, + "completions/min_terminated_length": 15.7, + "epoch": 0.08250825082508251, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.7231093045324087, + "learning_rate": 5.886336611179211e-07, + "loss": 0.0, + "num_tokens": 1725716.0, + "reward": 4.099999904632568, + "reward_std": 0.0, + "rewards/coherence_reward_func/mean": 1.2999999523162842, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 2.0, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 0.800000011920929, + "rewards/quality_reward_func/std": 0.0, + "step": 1200 } ], "logging_steps": 10, "max_steps": 1500, - "num_input_tokens_seen": 277602, + "num_input_tokens_seen": 1725716, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": {