diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,9 +2,9 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.014243978130947618, + "epoch": 0.020911372149689057, "eval_steps": 500, - "global_step": 2350, + "global_step": 3450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -6118,11 +6118,2871 @@ "rewards/quality_reward_func/mean": 1.3368490397930146, "rewards/quality_reward_func/std": 0.13240620964206756, "step": 2350 + }, + { + "completion_length": 33.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.2, + "completions/max_terminated_length": 33.2, + "completions/mean_length": 31.125, + "completions/mean_terminated_length": 31.125, + "completions/min_length": 29.3, + "completions/min_terminated_length": 29.3, + "epoch": 0.014304590803845269, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.13280260562896729, + "kl": 0.9531688548624515, + "learning_rate": 3.1739804233685528e-06, + "loss": 0.0001, + "num_tokens": 3048438.0, + "reward": 4.034874534606933, + "reward_std": 0.1207739002071321, + "rewards/coherence_reward_func/mean": 0.9774999976158142, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5573744893074035, + "rewards/quality_reward_func/std": 0.11691368520259857, + "step": 2360 + }, + { + "completion_length": 37.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.9, + "completions/max_terminated_length": 37.9, + "completions/mean_length": 34.6, + "completions/mean_terminated_length": 34.6, + "completions/min_length": 29.7, + "completions/min_terminated_length": 29.7, + "epoch": 0.014365203476742918, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.14214658737182617, + "kl": 0.7467479955404996, + "learning_rate": 3.1571570582470307e-06, + "loss": 0.0, + "num_tokens": 3065962.0, + "reward": 3.5746789455413817, + "reward_std": 0.10260191285051405, + "rewards/coherence_reward_func/mean": 0.9725000023841858, + "rewards/coherence_reward_func/std": 0.03403429687023163, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.102178880572319, + "rewards/quality_reward_func/std": 0.12953687296248972, + "step": 2370 + }, + { + "completion_length": 33.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.2, + "completions/max_terminated_length": 33.2, + "completions/mean_length": 32.175, + "completions/mean_terminated_length": 32.175, + "completions/min_length": 31.1, + "completions/min_terminated_length": 31.1, + "epoch": 0.014425816149640567, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.0, + "kl": 0.66828937754035, + "learning_rate": 3.1403016642175993e-06, + "loss": 0.0, + "num_tokens": 3079737.0, + "reward": 3.8128685474395754, + "reward_std": 0.028522996790707113, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3428685158491134, + "rewards/quality_reward_func/std": 0.02852298943325877, + "step": 2380 + }, + { + "completion_length": 46.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 43.675, + "completions/mean_terminated_length": 43.675, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.014486428822538215, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.20936551690101624, + "kl": 0.5772987704724073, + "learning_rate": 3.123415062788385e-06, + "loss": 0.0, + "num_tokens": 3092392.0, + "reward": 4.032087278366089, + "reward_std": 0.08739078380167484, + "rewards/coherence_reward_func/mean": 0.9474999964237213, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5845872640609742, + "rewards/quality_reward_func/std": 0.09230031631886959, + "step": 2390 + }, + { + "completion_length": 40.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.9, + "completions/max_terminated_length": 40.9, + "completions/mean_length": 36.225, + "completions/mean_terminated_length": 36.225, + "completions/min_length": 32.8, + "completions/min_terminated_length": 32.8, + "epoch": 0.014547041495435866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19811180233955383, + "kl": 0.5662164811044932, + "learning_rate": 3.106498076988519e-06, + "loss": 0.0, + "num_tokens": 3104437.0, + "reward": 3.769843649864197, + "reward_std": 0.22745495121926068, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2698437094688415, + "rewards/quality_reward_func/std": 0.2274549625813961, + "step": 2400 + }, + { + "completion_length": 46.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.2, + "completions/max_terminated_length": 46.2, + "completions/mean_length": 41.15, + "completions/mean_terminated_length": 41.15, + "completions/min_length": 37.3, + "completions/min_terminated_length": 37.3, + "epoch": 0.014607654168333515, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.376869261264801, + "kl": 0.5972482226788998, + "learning_rate": 3.089551531328021e-06, + "loss": 0.0001, + "num_tokens": 3117803.0, + "reward": 3.681988263130188, + "reward_std": 0.19493986666202545, + "rewards/coherence_reward_func/mean": 0.9899999976158143, + "rewards/coherence_reward_func/std": 0.020000000298023225, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.1919882595539093, + "rewards/quality_reward_func/std": 0.17984362840652465, + "step": 2410 + }, + { + "completion_length": 47.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.9, + "completions/max_terminated_length": 47.9, + "completions/mean_length": 45.675, + "completions/mean_terminated_length": 45.675, + "completions/min_length": 42.9, + "completions/min_terminated_length": 42.9, + "epoch": 0.014668266841231164, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.19029180705547333, + "kl": 0.5135284159332514, + "learning_rate": 3.0725762517576197e-06, + "loss": 0.0, + "num_tokens": 3130486.0, + "reward": 3.933048152923584, + "reward_std": 0.05088914311490953, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4330482244491578, + "rewards/quality_reward_func/std": 0.050889137154445055, + "step": 2420 + }, + { + "completion_length": 40.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.6, + "completions/max_terminated_length": 40.6, + "completions/mean_length": 37.2, + "completions/mean_terminated_length": 37.2, + "completions/min_length": 33.6, + "completions/min_terminated_length": 33.6, + "epoch": 0.014728879514128814, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.17371055483818054, + "kl": 0.5341577146202325, + "learning_rate": 3.0555730656284917e-06, + "loss": 0.0, + "num_tokens": 3145170.0, + "reward": 3.8729047536849976, + "reward_std": 0.26118058804422617, + "rewards/coherence_reward_func/mean": 0.9449999988079071, + "rewards/coherence_reward_func/std": 0.05, + "rewards/formatting_reward_func/mean": 1.4625, + "rewards/formatting_reward_func/std": 0.075, + "rewards/quality_reward_func/mean": 1.465404713153839, + "rewards/quality_reward_func/std": 0.13618059642612934, + "step": 2430 + }, + { + "completion_length": 45.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.1, + "completions/max_terminated_length": 45.1, + "completions/mean_length": 41.275, + "completions/mean_terminated_length": 41.275, + "completions/min_length": 38.4, + "completions/min_terminated_length": 38.4, + "epoch": 0.014789492187026463, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.0, + "kl": 0.5556940101087093, + "learning_rate": 3.0385428016519413e-06, + "loss": 0.0, + "num_tokens": 3158737.0, + "reward": 3.77139093875885, + "reward_std": 0.10554739125072957, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3013909101486205, + "rewards/quality_reward_func/std": 0.10554737057536841, + "step": 2440 + }, + { + "completion_length": 45.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.2, + "completions/max_terminated_length": 45.2, + "completions/mean_length": 39.375, + "completions/mean_terminated_length": 39.375, + "completions/min_length": 35.2, + "completions/min_terminated_length": 35.2, + "epoch": 0.014850104859924112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27828526496887207, + "kl": 0.4428048962727189, + "learning_rate": 3.02148628985901e-06, + "loss": 0.0, + "num_tokens": 3170780.0, + "reward": 3.7035243988037108, + "reward_std": 0.19730213293805718, + "rewards/coherence_reward_func/mean": 0.9925000011920929, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2110244035720825, + "rewards/quality_reward_func/std": 0.18937852047383785, + "step": 2450 + }, + { + "completion_length": 43.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.2, + "completions/max_terminated_length": 43.2, + "completions/mean_length": 39.45, + "completions/mean_terminated_length": 39.45, + "completions/min_length": 36.2, + "completions/min_terminated_length": 36.2, + "epoch": 0.014910717532821763, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1852184385061264, + "kl": 0.5022999217733741, + "learning_rate": 3.0044043615600176e-06, + "loss": 0.0, + "num_tokens": 3185042.0, + "reward": 3.773235487937927, + "reward_std": 0.07152210185304284, + "rewards/coherence_reward_func/mean": 0.9850000023841858, + "rewards/coherence_reward_func/std": 0.017320507764816286, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2882355391979217, + "rewards/quality_reward_func/std": 0.08858798267319798, + "step": 2460 + }, + { + "completion_length": 46.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.9, + "completions/max_terminated_length": 46.9, + "completions/mean_length": 43.825, + "completions/mean_terminated_length": 43.825, + "completions/min_length": 39.7, + "completions/min_terminated_length": 39.7, + "epoch": 0.014971330205719412, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12899985909461975, + "kl": 0.40727804601192474, + "learning_rate": 2.9872978493040517e-06, + "loss": 0.0, + "num_tokens": 3199671.0, + "reward": 3.994355297088623, + "reward_std": 0.07916008960455656, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4943552970886231, + "rewards/quality_reward_func/std": 0.0791601019911468, + "step": 2470 + }, + { + "completion_length": 41.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.2, + "completions/max_terminated_length": 41.2, + "completions/mean_length": 38.1, + "completions/mean_terminated_length": 38.1, + "completions/min_length": 34.9, + "completions/min_terminated_length": 34.9, + "epoch": 0.01503194287861706, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.1691986471414566, + "kl": 0.43959119068458674, + "learning_rate": 2.970167586838385e-06, + "loss": 0.0, + "num_tokens": 3210627.0, + "reward": 3.8784923553466797, + "reward_std": 0.06513606734806672, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3784924566745758, + "rewards/quality_reward_func/std": 0.06513608191162348, + "step": 2480 + }, + { + "completion_length": 42.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.8, + "completions/max_terminated_length": 42.8, + "completions/mean_length": 37.95, + "completions/mean_terminated_length": 37.95, + "completions/min_length": 34.5, + "completions/min_terminated_length": 34.5, + "epoch": 0.015092555551514711, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.16865229606628418, + "kl": 0.5600904650986195, + "learning_rate": 2.9530144090678435e-06, + "loss": 0.0, + "num_tokens": 3223301.0, + "reward": 3.7232309341430665, + "reward_std": 0.11923125218600035, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2232309579849243, + "rewards/quality_reward_func/std": 0.11923126853071153, + "step": 2490 + }, + { + "completion_length": 42.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 39.325, + "completions/mean_terminated_length": 39.325, + "completions/min_length": 36.1, + "completions/min_terminated_length": 36.1, + "epoch": 0.01515316822441236, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.22221040725708008, + "kl": 0.6170519307255745, + "learning_rate": 2.935839152014112e-06, + "loss": 0.0, + "num_tokens": 3238534.0, + "reward": 3.781364846229553, + "reward_std": 0.11561555415391922, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3113647818565368, + "rewards/quality_reward_func/std": 0.11561555750668048, + "step": 2500 + }, + { + "completion_length": 46.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.4, + "completions/max_terminated_length": 46.4, + "completions/mean_length": 43.55, + "completions/mean_terminated_length": 43.55, + "completions/min_length": 40.5, + "completions/min_terminated_length": 40.5, + "epoch": 0.015213780897310009, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.21276995539665222, + "kl": 0.5121112320572138, + "learning_rate": 2.918642652774989e-06, + "loss": 0.0, + "num_tokens": 3249820.0, + "reward": 3.8134325742721558, + "reward_std": 0.0658552709966898, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3434325933456421, + "rewards/quality_reward_func/std": 0.06585524827241898, + "step": 2510 + }, + { + "completion_length": 44.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.5, + "completions/max_terminated_length": 44.5, + "completions/mean_length": 42.7, + "completions/mean_terminated_length": 42.7, + "completions/min_length": 41.3, + "completions/min_terminated_length": 41.3, + "epoch": 0.01527439357020766, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.3839207589626312, + "kl": 0.3808361187577248, + "learning_rate": 2.9014257494835863e-06, + "loss": 0.0, + "num_tokens": 3262784.0, + "reward": 3.908826160430908, + "reward_std": 0.08951527504250407, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4388261198997498, + "rewards/quality_reward_func/std": 0.08951530596241355, + "step": 2520 + }, + { + "completion_length": 42.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.3, + "completions/max_terminated_length": 42.3, + "completions/mean_length": 38.75, + "completions/mean_terminated_length": 38.75, + "completions/min_length": 36.3, + "completions/min_terminated_length": 36.3, + "epoch": 0.015335006243105308, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.2821073532104492, + "kl": 0.3947053180076182, + "learning_rate": 2.884189281267481e-06, + "loss": 0.0, + "num_tokens": 3275266.0, + "reward": 4.0385295152664185, + "reward_std": 0.10868711099028587, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5385294795036315, + "rewards/quality_reward_func/std": 0.10868713408708572, + "step": 2530 + }, + { + "completion_length": 49.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 45.3, + "completions/mean_terminated_length": 45.3, + "completions/min_length": 40.8, + "completions/min_terminated_length": 40.8, + "epoch": 0.015395618916002957, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.14954452216625214, + "kl": 0.568367613106966, + "learning_rate": 2.8669340882078166e-06, + "loss": 0.0001, + "num_tokens": 3289442.0, + "reward": 3.994439148902893, + "reward_std": 0.08471027053892613, + "rewards/coherence_reward_func/mean": 0.9099999964237213, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5844391584396362, + "rewards/quality_reward_func/std": 0.08471028534695506, + "step": 2540 + }, + { + "completion_length": 39.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.9, + "completions/max_terminated_length": 39.9, + "completions/mean_length": 38.975, + "completions/mean_terminated_length": 38.975, + "completions/min_length": 37.8, + "completions/min_terminated_length": 37.8, + "epoch": 0.015456231588900608, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "kl": 0.5600289195775986, + "learning_rate": 2.8496610112983607e-06, + "loss": 0.0, + "num_tokens": 3303761.0, + "reward": 3.9027648687362673, + "reward_std": 0.05694897845387459, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4327647507190704, + "rewards/quality_reward_func/std": 0.05694894678890705, + "step": 2550 + }, + { + "completion_length": 45.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.1, + "completions/max_terminated_length": 45.1, + "completions/mean_length": 39.425, + "completions/mean_terminated_length": 39.425, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.015516844261798257, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.17587892711162567, + "kl": 0.5447262158617378, + "learning_rate": 2.8323708924045112e-06, + "loss": 0.0, + "num_tokens": 3318370.0, + "reward": 3.9645920515060427, + "reward_std": 0.1159634368494153, + "rewards/coherence_reward_func/mean": 0.9824999988079071, + "rewards/coherence_reward_func/std": 0.034999999403953555, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.482092034816742, + "rewards/quality_reward_func/std": 0.08113080505281686, + "step": 2560 + }, + { + "completion_length": 49.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.6, + "completions/max_terminated_length": 49.6, + "completions/mean_length": 45.925, + "completions/mean_terminated_length": 45.925, + "completions/min_length": 42.2, + "completions/min_terminated_length": 42.2, + "epoch": 0.015577456934695906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08882153779268265, + "kl": 0.7832069963216781, + "learning_rate": 2.8150645742222716e-06, + "loss": 0.0001, + "num_tokens": 3329487.0, + "reward": 3.984575343132019, + "reward_std": 0.07680719960480928, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4845752358436584, + "rewards/quality_reward_func/std": 0.07680717501789332, + "step": 2570 + }, + { + "completion_length": 39.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 35.625, + "completions/mean_terminated_length": 35.625, + "completions/min_length": 32.2, + "completions/min_terminated_length": 32.2, + "epoch": 0.015638069607593556, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.42785578966140747, + "kl": 0.6773035958409309, + "learning_rate": 2.797742900237175e-06, + "loss": 0.0, + "num_tokens": 3341460.0, + "reward": 3.8624061584472655, + "reward_std": 0.09481968693435192, + "rewards/coherence_reward_func/mean": 0.9550000011920929, + "rewards/coherence_reward_func/std": 0.017320507764816286, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4074061989784241, + "rewards/quality_reward_func/std": 0.07804237883538008, + "step": 2580 + }, + { + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 33.425, + "completions/mean_terminated_length": 33.425, + "completions/min_length": 32.4, + "completions/min_terminated_length": 32.4, + "epoch": 0.015698682280491205, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "kl": 0.6863901816308499, + "learning_rate": 2.7804067146831724e-06, + "loss": 0.0, + "num_tokens": 3354581.0, + "reward": 3.9445890188217163, + "reward_std": 0.029083981364965438, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4445890128612517, + "rewards/quality_reward_func/std": 0.029084013029932975, + "step": 2590 + }, + { + "completion_length": 42.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.5, + "completions/max_terminated_length": 42.5, + "completions/mean_length": 40.625, + "completions/mean_terminated_length": 40.625, + "completions/min_length": 39.1, + "completions/min_terminated_length": 39.1, + "epoch": 0.015759294953388854, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.2602120339870453, + "kl": 0.7010997839272022, + "learning_rate": 2.763056862501492e-06, + "loss": 0.0001, + "num_tokens": 3362930.0, + "reward": 4.203077960014343, + "reward_std": 0.05241375220939517, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.7330780267715453, + "rewards/quality_reward_func/std": 0.052413776610046627, + "step": 2600 + }, + { + "completion_length": 43.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 40.05, + "completions/mean_terminated_length": 40.05, + "completions/min_length": 37.4, + "completions/min_terminated_length": 37.4, + "epoch": 0.015819907626286503, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.17323555052280426, + "kl": 0.7230022266507149, + "learning_rate": 2.7456941892994497e-06, + "loss": 0.0001, + "num_tokens": 3378688.0, + "reward": 3.926683044433594, + "reward_std": 0.047219998016953466, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4266830563545227, + "rewards/quality_reward_func/std": 0.04721996132284403, + "step": 2610 + }, + { + "completion_length": 37.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.1, + "completions/max_terminated_length": 37.1, + "completions/mean_length": 35.275, + "completions/mean_terminated_length": 35.275, + "completions/min_length": 33.5, + "completions/min_terminated_length": 33.5, + "epoch": 0.015880520299184152, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.27321839332580566, + "kl": 0.5179006870836019, + "learning_rate": 2.7283195413092444e-06, + "loss": 0.0, + "num_tokens": 3390719.0, + "reward": 3.8513688802719117, + "reward_std": 0.17389362622052432, + "rewards/coherence_reward_func/mean": 0.975, + "rewards/coherence_reward_func/std": 0.05, + "rewards/formatting_reward_func/mean": 1.4875, + "rewards/formatting_reward_func/std": 0.025, + "rewards/quality_reward_func/mean": 1.3888689264655114, + "rewards/quality_reward_func/std": 0.09925779986660928, + "step": 2620 + }, + { + "completion_length": 39.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.1, + "completions/max_terminated_length": 39.1, + "completions/mean_length": 36.675, + "completions/mean_terminated_length": 36.675, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.015941132972081804, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.25069549679756165, + "kl": 0.7381820097565651, + "learning_rate": 2.7109337653467072e-06, + "loss": 0.0001, + "num_tokens": 3405230.0, + "reward": 3.8772063493728637, + "reward_std": 0.10762457083910704, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3772063851356506, + "rewards/quality_reward_func/std": 0.10762461256235838, + "step": 2630 + }, + { + "completion_length": 35.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.4, + "completions/max_terminated_length": 35.4, + "completions/mean_length": 33.6, + "completions/mean_terminated_length": 33.6, + "completions/min_length": 31.2, + "completions/min_terminated_length": 31.2, + "epoch": 0.016001745644979453, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.2519443929195404, + "kl": 1.0420048169791698, + "learning_rate": 2.6935377087700297e-06, + "loss": 0.0001, + "num_tokens": 3416838.0, + "reward": 3.91592059135437, + "reward_std": 0.06143229096196592, + "rewards/coherence_reward_func/mean": 0.9199999988079071, + "rewards/coherence_reward_func/std": 0.020000000298023225, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4959205716848374, + "rewards/quality_reward_func/std": 0.04624048583209515, + "step": 2640 + }, + { + "completion_length": 45.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.4, + "completions/max_terminated_length": 45.4, + "completions/mean_length": 42.575, + "completions/mean_terminated_length": 42.575, + "completions/min_length": 39.4, + "completions/min_terminated_length": 39.4, + "epoch": 0.016062358317877102, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.0, + "kl": 0.6294115573167801, + "learning_rate": 2.6761322194384676e-06, + "loss": 0.0001, + "num_tokens": 3430097.0, + "reward": 3.9316094875335694, + "reward_std": 0.09293769309297203, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4616094708442688, + "rewards/quality_reward_func/std": 0.09293768610805273, + "step": 2650 + }, + { + "completion_length": 43.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.1, + "completions/max_terminated_length": 43.1, + "completions/mean_length": 39.025, + "completions/mean_terminated_length": 39.025, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.01612297099077475, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.0, + "kl": 0.7926542639732361, + "learning_rate": 2.6587181456710154e-06, + "loss": 0.0001, + "num_tokens": 3441302.0, + "reward": 3.769076681137085, + "reward_std": 0.03707691185409203, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2690767049789429, + "rewards/quality_reward_func/std": 0.03707688362337649, + "step": 2660 + }, + { + "completion_length": 42.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 39.775, + "completions/mean_terminated_length": 39.775, + "completions/min_length": 36.7, + "completions/min_terminated_length": 36.7, + "epoch": 0.0161835836636724, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.4314729869365692, + "kl": 0.7939471632242203, + "learning_rate": 2.641296336205062e-06, + "loss": 0.0001, + "num_tokens": 3455385.0, + "reward": 3.7821798801422117, + "reward_std": 0.104853530600667, + "rewards/coherence_reward_func/mean": 0.9925000011920929, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.289679890871048, + "rewards/quality_reward_func/std": 0.09481636472046376, + "step": 2670 + }, + { + "completion_length": 37.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 35.475, + "completions/mean_terminated_length": 35.475, + "completions/min_length": 34.2, + "completions/min_terminated_length": 34.2, + "epoch": 0.01624419633657005, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.33191776275634766, + "kl": 0.5365656912326813, + "learning_rate": 2.6238676401550205e-06, + "loss": 0.0, + "num_tokens": 3464440.0, + "reward": 4.1119883298873905, + "reward_std": 0.06921507436782122, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.641988343000412, + "rewards/quality_reward_func/std": 0.06921505890786647, + "step": 2680 + }, + { + "completion_length": 38.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.4, + "completions/max_terminated_length": 38.4, + "completions/mean_length": 37.25, + "completions/mean_terminated_length": 37.25, + "completions/min_length": 35.7, + "completions/min_terminated_length": 35.7, + "epoch": 0.0163048090094677, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.09902704507112503, + "kl": 0.6591603338718415, + "learning_rate": 2.6064329069709495e-06, + "loss": 0.0, + "num_tokens": 3479326.0, + "reward": 4.098807001113892, + "reward_std": 0.04632377550005913, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5988069772720337, + "rewards/quality_reward_func/std": 0.046323776338249446, + "step": 2690 + }, + { + "completion_length": 37.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.2, + "completions/max_terminated_length": 37.2, + "completions/mean_length": 34.825, + "completions/mean_terminated_length": 34.825, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.01636542168236535, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.0, + "kl": 0.7738334469497203, + "learning_rate": 2.5889929863971465e-06, + "loss": 0.0001, + "num_tokens": 3495171.0, + "reward": 3.901769185066223, + "reward_std": 0.07665818370878696, + "rewards/coherence_reward_func/mean": 0.9925000011920929, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.409269118309021, + "rewards/quality_reward_func/std": 0.0794066557660699, + "step": 2700 + }, + { + "completion_length": 45.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.6, + "completions/max_terminated_length": 45.6, + "completions/mean_length": 40.875, + "completions/mean_terminated_length": 40.875, + "completions/min_length": 34.5, + "completions/min_terminated_length": 34.5, + "epoch": 0.016426034355263, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.0, + "kl": 0.5373141642659902, + "learning_rate": 2.571548728430737e-06, + "loss": 0.0, + "num_tokens": 3507754.0, + "reward": 3.834697890281677, + "reward_std": 0.06779527999460697, + "rewards/coherence_reward_func/mean": 0.9600000023841858, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.374697893857956, + "rewards/quality_reward_func/std": 0.06779528856277466, + "step": 2710 + }, + { + "completion_length": 38.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.5, + "completions/max_terminated_length": 38.5, + "completions/mean_length": 37.7, + "completions/mean_terminated_length": 37.7, + "completions/min_length": 37.2, + "completions/min_terminated_length": 37.2, + "epoch": 0.016486647028160648, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.40601110458374023, + "kl": 0.8498925857245923, + "learning_rate": 2.5541009832802448e-06, + "loss": 0.0001, + "num_tokens": 3522894.0, + "reward": 4.105284094810486, + "reward_std": 0.04365409443853423, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.6052841067314148, + "rewards/quality_reward_func/std": 0.043654035578947516, + "step": 2720 + }, + { + "completion_length": 47.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.1, + "completions/max_terminated_length": 47.1, + "completions/mean_length": 42.1, + "completions/mean_terminated_length": 42.1, + "completions/min_length": 38.1, + "completions/min_terminated_length": 38.1, + "epoch": 0.016547259701058296, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.3551698923110962, + "kl": 0.7001808919012547, + "learning_rate": 2.536650601324152e-06, + "loss": 0.0001, + "num_tokens": 3534554.0, + "reward": 4.1095947265625, + "reward_std": 0.13604998160153628, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.6395947456359863, + "rewards/quality_reward_func/std": 0.13604997415095568, + "step": 2730 + }, + { + "completion_length": 34.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.3, + "completions/max_terminated_length": 34.3, + "completions/mean_length": 32.975, + "completions/mean_terminated_length": 32.975, + "completions/min_length": 31.3, + "completions/min_terminated_length": 31.3, + "epoch": 0.016607872373955945, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.13214418292045593, + "kl": 0.7773146666586399, + "learning_rate": 2.5191984330694576e-06, + "loss": 0.0001, + "num_tokens": 3546873.0, + "reward": 3.9706292390823363, + "reward_std": 0.06375699776108376, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4706292569637298, + "rewards/quality_reward_func/std": 0.06375699583441019, + "step": 2740 + }, + { + "completion_length": 30.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.9, + "completions/max_terminated_length": 30.9, + "completions/mean_length": 29.325, + "completions/mean_terminated_length": 29.325, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.016668485046853598, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.0, + "kl": 0.811890983581543, + "learning_rate": 2.501745329110219e-06, + "loss": 0.0, + "num_tokens": 3560378.0, + "reward": 3.628898596763611, + "reward_std": 0.10628243210958317, + "rewards/coherence_reward_func/mean": 0.8399999976158142, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2888986378908158, + "rewards/quality_reward_func/std": 0.10628242962993681, + "step": 2750 + }, + { + "completion_length": 45.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.2, + "completions/max_terminated_length": 45.2, + "completions/mean_length": 41.5, + "completions/mean_terminated_length": 41.5, + "completions/min_length": 38.3, + "completions/min_terminated_length": 38.3, + "epoch": 0.016729097719751247, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.3302355110645294, + "kl": 0.6666261859238147, + "learning_rate": 2.484292140086103e-06, + "loss": 0.0001, + "num_tokens": 3575330.0, + "reward": 3.565551996231079, + "reward_std": 0.0728124035988003, + "rewards/coherence_reward_func/mean": 0.9824999988079071, + "rewards/coherence_reward_func/std": 0.034999999403953555, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.0830519676208497, + "rewards/quality_reward_func/std": 0.03782532922923565, + "step": 2760 + }, + { + "completion_length": 45.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.0, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 42.75, + "completions/mean_terminated_length": 42.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.016789710392648895, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1726081371307373, + "kl": 0.6724840953946114, + "learning_rate": 2.4668397166409184e-06, + "loss": 0.0001, + "num_tokens": 3589156.0, + "reward": 3.8185994148254396, + "reward_std": 0.038681184966117145, + "rewards/coherence_reward_func/mean": 0.9599999964237214, + "rewards/coherence_reward_func/std": 0.020000000298023225, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.358599418401718, + "rewards/quality_reward_func/std": 0.044405206956434996, + "step": 2770 + }, + { + "completion_length": 38.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.7, + "completions/max_terminated_length": 38.7, + "completions/mean_length": 37.225, + "completions/mean_terminated_length": 37.225, + "completions/min_length": 35.8, + "completions/min_terminated_length": 35.8, + "epoch": 0.016850323065546544, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.13680297136306763, + "kl": 0.7626109138131142, + "learning_rate": 2.4493889093811624e-06, + "loss": 0.0001, + "num_tokens": 3599309.0, + "reward": 3.9401774168014527, + "reward_std": 0.04904340072534978, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4401773750782012, + "rewards/quality_reward_func/std": 0.049043377256020905, + "step": 2780 + }, + { + "completion_length": 53.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.2, + "completions/max_terminated_length": 53.2, + "completions/mean_length": 47.65, + "completions/mean_terminated_length": 47.65, + "completions/min_length": 41.4, + "completions/min_terminated_length": 41.4, + "epoch": 0.016910935738444193, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.16031678020954132, + "kl": 0.47936664558947084, + "learning_rate": 2.4319405688345614e-06, + "loss": 0.0001, + "num_tokens": 3610687.0, + "reward": 3.7313020706176756, + "reward_std": 0.17685027779079973, + "rewards/coherence_reward_func/mean": 0.9400000005960465, + "rewards/coherence_reward_func/std": 0.05000000149011612, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.291302090883255, + "rewards/quality_reward_func/std": 0.12713201008737088, + "step": 2790 + }, + { + "completion_length": 35.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.5, + "completions/max_terminated_length": 35.5, + "completions/mean_length": 34.0, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 32.8, + "completions/min_terminated_length": 32.8, + "epoch": 0.016971548411341842, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.1880660057067871, + "kl": 0.5243588045239449, + "learning_rate": 2.414495545408619e-06, + "loss": 0.0, + "num_tokens": 3623799.0, + "reward": 3.986675810813904, + "reward_std": 0.07717739315703512, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5166758596897125, + "rewards/quality_reward_func/std": 0.07717741429805755, + "step": 2800 + }, + { + "completion_length": 47.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 43.2, + "completions/mean_terminated_length": 43.2, + "completions/min_length": 38.7, + "completions/min_terminated_length": 38.7, + "epoch": 0.017032161084239494, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.33259984850883484, + "kl": 0.48235350362956525, + "learning_rate": 2.3970546893491637e-06, + "loss": 0.0, + "num_tokens": 3636703.0, + "reward": 3.8235735654830934, + "reward_std": 0.10343162054196, + "rewards/coherence_reward_func/mean": 0.9774999976158142, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3460735499858856, + "rewards/quality_reward_func/std": 0.10343165006488561, + "step": 2810 + }, + { + "completion_length": 34.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.9, + "completions/max_terminated_length": 34.9, + "completions/mean_length": 33.475, + "completions/mean_terminated_length": 33.475, + "completions/min_length": 31.1, + "completions/min_terminated_length": 31.1, + "epoch": 0.017092773757137143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17236274480819702, + "kl": 0.6768838051706553, + "learning_rate": 2.3796188506989153e-06, + "loss": 0.0, + "num_tokens": 3649826.0, + "reward": 3.8158772706985475, + "reward_std": 0.06938843043753877, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3458772778511048, + "rewards/quality_reward_func/std": 0.0693884058156982, + "step": 2820 + }, + { + "completion_length": 41.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.8, + "completions/max_terminated_length": 41.8, + "completions/mean_length": 38.125, + "completions/mean_terminated_length": 38.125, + "completions/min_length": 35.2, + "completions/min_terminated_length": 35.2, + "epoch": 0.017153386430034792, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.23252420127391815, + "kl": 0.8006950195878744, + "learning_rate": 2.3621888792560517e-06, + "loss": 0.0001, + "num_tokens": 3661951.0, + "reward": 3.8711918115615847, + "reward_std": 0.06124148964881897, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4011918485164643, + "rewards/quality_reward_func/std": 0.06124151721596718, + "step": 2830 + }, + { + "completion_length": 40.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.1, + "completions/max_terminated_length": 40.1, + "completions/mean_length": 37.4, + "completions/mean_terminated_length": 37.4, + "completions/min_length": 35.7, + "completions/min_terminated_length": 35.7, + "epoch": 0.01721399910293244, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.15335425734519958, + "kl": 0.7628875311464072, + "learning_rate": 2.3447656245327903e-06, + "loss": 0.0001, + "num_tokens": 3674359.0, + "reward": 3.9296700239181517, + "reward_std": 0.053428084636107084, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4296700298786162, + "rewards/quality_reward_func/std": 0.05342807814013213, + "step": 2840 + }, + { + "completion_length": 54.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.1, + "completions/max_terminated_length": 54.1, + "completions/mean_length": 47.575, + "completions/mean_terminated_length": 47.575, + "completions/min_length": 42.9, + "completions/min_terminated_length": 42.9, + "epoch": 0.01727461177583009, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.0, + "kl": 0.44681017491966485, + "learning_rate": 2.327349935713986e-06, + "loss": 0.0, + "num_tokens": 3684542.0, + "reward": 3.7153425455093383, + "reward_std": 0.1435070670908317, + "rewards/coherence_reward_func/mean": 0.9424999952316284, + "rewards/coherence_reward_func/std": 0.08500000089406967, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.272842526435852, + "rewards/quality_reward_func/std": 0.0679278950439766, + "step": 2850 + }, + { + "completion_length": 39.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.3, + "completions/max_terminated_length": 39.3, + "completions/mean_length": 37.45, + "completions/mean_terminated_length": 37.45, + "completions/min_length": 35.4, + "completions/min_terminated_length": 35.4, + "epoch": 0.01733522444872774, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.3788013458251953, + "kl": 0.6096105173230171, + "learning_rate": 2.309942661615742e-06, + "loss": 0.0, + "num_tokens": 3696104.0, + "reward": 4.1152328729629515, + "reward_std": 0.07895004339516162, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.6152329444885254, + "rewards/quality_reward_func/std": 0.07895006146281958, + "step": 2860 + }, + { + "completion_length": 37.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.2, + "completions/max_terminated_length": 37.2, + "completions/mean_length": 36.325, + "completions/mean_terminated_length": 36.325, + "completions/min_length": 34.7, + "completions/min_terminated_length": 34.7, + "epoch": 0.01739583712162539, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.2079620063304901, + "kl": 0.6374891117215157, + "learning_rate": 2.2925446506440403e-06, + "loss": 0.0, + "num_tokens": 3709281.0, + "reward": 3.604074263572693, + "reward_std": 0.09577681496739388, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.13407421708107, + "rewards/quality_reward_func/std": 0.09577685054391623, + "step": 2870 + }, + { + "completion_length": 44.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.4, + "completions/max_terminated_length": 44.4, + "completions/mean_length": 41.075, + "completions/mean_terminated_length": 41.075, + "completions/min_length": 37.8, + "completions/min_terminated_length": 37.8, + "epoch": 0.01745644979452304, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.0, + "kl": 0.6280573938041926, + "learning_rate": 2.2751567507533908e-06, + "loss": 0.0, + "num_tokens": 3720536.0, + "reward": 3.911236310005188, + "reward_std": 0.09300461623352022, + "rewards/coherence_reward_func/mean": 0.9774999976158142, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.433736264705658, + "rewards/quality_reward_func/std": 0.08958076136186718, + "step": 2880 + }, + { + "completion_length": 44.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.1, + "completions/max_terminated_length": 44.1, + "completions/mean_length": 42.375, + "completions/mean_terminated_length": 42.375, + "completions/min_length": 40.1, + "completions/min_terminated_length": 40.1, + "epoch": 0.01751706246742069, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.0, + "kl": 0.65657629519701, + "learning_rate": 2.2577798094055028e-06, + "loss": 0.0001, + "num_tokens": 3734043.0, + "reward": 3.9169987440109253, + "reward_std": 0.09950992427766323, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4169987499713899, + "rewards/quality_reward_func/std": 0.09950996097177267, + "step": 2890 + }, + { + "completion_length": 44.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.6, + "completions/max_terminated_length": 44.6, + "completions/mean_length": 40.075, + "completions/mean_terminated_length": 40.075, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.017577675140318338, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.19218558073043823, + "kl": 0.4676514953374863, + "learning_rate": 2.2404146735279823e-06, + "loss": 0.0, + "num_tokens": 3748242.0, + "reward": 3.964124250411987, + "reward_std": 0.06019405350089073, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4941242456436157, + "rewards/quality_reward_func/std": 0.060194038599729535, + "step": 2900 + }, + { + "completion_length": 31.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.9, + "completions/max_terminated_length": 31.9, + "completions/mean_length": 31.75, + "completions/mean_terminated_length": 31.75, + "completions/min_length": 31.4, + "completions/min_terminated_length": 31.4, + "epoch": 0.017638287813215987, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "kl": 0.6983957252814434, + "learning_rate": 2.223062189473054e-06, + "loss": 0.0, + "num_tokens": 3760236.0, + "reward": 3.952869749069214, + "reward_std": 0.014002268150215968, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4528697073459624, + "rewards/quality_reward_func/std": 0.0140022435458377, + "step": 2910 + }, + { + "completion_length": 41.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.6, + "completions/max_terminated_length": 41.6, + "completions/mean_length": 39.8, + "completions/mean_terminated_length": 39.8, + "completions/min_length": 37.4, + "completions/min_terminated_length": 37.4, + "epoch": 0.017698900486113636, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.18282218277454376, + "kl": 0.4654235543683171, + "learning_rate": 2.2057232029763092e-06, + "loss": 0.0, + "num_tokens": 3770848.0, + "reward": 3.6577728748321534, + "reward_std": 0.0783234752714634, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.187772911787033, + "rewards/quality_reward_func/std": 0.07832347678486258, + "step": 2920 + }, + { + "completion_length": 33.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.3, + "completions/max_terminated_length": 33.3, + "completions/mean_length": 31.725, + "completions/mean_terminated_length": 31.725, + "completions/min_length": 30.7, + "completions/min_terminated_length": 30.7, + "epoch": 0.017759513159011284, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.1789316087961197, + "kl": 0.6979955821298063, + "learning_rate": 2.188398559115489e-06, + "loss": 0.0, + "num_tokens": 3783945.0, + "reward": 3.916050124168396, + "reward_std": 0.14435958303511143, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4160500228405, + "rewards/quality_reward_func/std": 0.14435954354703426, + "step": 2930 + }, + { + "completion_length": 40.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.8, + "completions/max_terminated_length": 40.8, + "completions/mean_length": 38.725, + "completions/mean_terminated_length": 38.725, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.017820125831908937, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.27817022800445557, + "kl": 0.5846670583821834, + "learning_rate": 2.171089102269294e-06, + "loss": 0.0, + "num_tokens": 3796002.0, + "reward": 3.9199183702468874, + "reward_std": 0.07633975064381956, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4499184250831605, + "rewards/quality_reward_func/std": 0.07633973751217127, + "step": 2940 + }, + { + "completion_length": 46.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.5, + "completions/max_terminated_length": 46.5, + "completions/mean_length": 41.75, + "completions/mean_terminated_length": 41.75, + "completions/min_length": 38.2, + "completions/min_terminated_length": 38.2, + "epoch": 0.017880738504806586, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.0, + "kl": 0.6097869090735912, + "learning_rate": 2.1537956760762296e-06, + "loss": 0.0, + "num_tokens": 3806716.0, + "reward": 3.9926924228668215, + "reward_std": 0.11543759661726653, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4926924526691436, + "rewards/quality_reward_func/std": 0.11543761247303337, + "step": 2950 + }, + { + "completion_length": 40.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.7, + "completions/max_terminated_length": 40.7, + "completions/mean_length": 38.825, + "completions/mean_terminated_length": 38.825, + "completions/min_length": 35.9, + "completions/min_terminated_length": 35.9, + "epoch": 0.017941351177704234, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.11390696465969086, + "kl": 0.6256432753056288, + "learning_rate": 2.136519123393493e-06, + "loss": 0.0, + "num_tokens": 3820641.0, + "reward": 4.011971044540405, + "reward_std": 0.08256426015868783, + "rewards/coherence_reward_func/mean": 0.9600000023841858, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5519711196422576, + "rewards/quality_reward_func/std": 0.08256429834291339, + "step": 2960 + }, + { + "completion_length": 43.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.8, + "completions/max_terminated_length": 43.8, + "completions/mean_length": 38.875, + "completions/mean_terminated_length": 38.875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.018001963850601883, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.35060346126556396, + "kl": 0.6076034534722566, + "learning_rate": 2.1192602862558864e-06, + "loss": 0.0, + "num_tokens": 3833028.0, + "reward": 3.6884315967559815, + "reward_std": 0.12104190215468406, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.188431590795517, + "rewards/quality_reward_func/std": 0.1210419088602066, + "step": 2970 + }, + { + "completion_length": 52.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.4, + "completions/max_terminated_length": 52.4, + "completions/mean_length": 45.85, + "completions/mean_terminated_length": 45.85, + "completions/min_length": 39.3, + "completions/min_terminated_length": 39.3, + "epoch": 0.018062576523499532, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.3901742398738861, + "kl": 0.5164080902934074, + "learning_rate": 2.1020200058347836e-06, + "loss": 0.0, + "num_tokens": 3845054.0, + "reward": 3.76722936630249, + "reward_std": 0.08552623242139816, + "rewards/coherence_reward_func/mean": 0.9399999976158142, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3272293865680695, + "rewards/quality_reward_func/std": 0.08552621733397245, + "step": 2980 + }, + { + "completion_length": 51.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 51.5, + "completions/max_terminated_length": 51.5, + "completions/mean_length": 47.825, + "completions/mean_terminated_length": 47.825, + "completions/min_length": 43.5, + "completions/min_terminated_length": 43.5, + "epoch": 0.01812318919639718, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5072823166847229, + "kl": 0.5146443182602525, + "learning_rate": 2.0847991223971305e-06, + "loss": 0.0, + "num_tokens": 3856715.0, + "reward": 3.7614234685897827, + "reward_std": 0.16738499663770198, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2914234220981597, + "rewards/quality_reward_func/std": 0.16738500162027775, + "step": 2990 + }, + { + "completion_length": 48.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.7, + "completions/max_terminated_length": 48.7, + "completions/mean_length": 42.675, + "completions/mean_terminated_length": 42.675, + "completions/min_length": 39.2, + "completions/min_terminated_length": 39.2, + "epoch": 0.018183801869294833, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.23874402046203613, + "kl": 0.4912438288331032, + "learning_rate": 2.067598475264491e-06, + "loss": 0.0, + "num_tokens": 3869474.0, + "reward": 4.022751498222351, + "reward_std": 0.03978296392597258, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5227514982223511, + "rewards/quality_reward_func/std": 0.039782968908548356, + "step": 3000 + }, + { + "completion_length": 39.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.3, + "completions/max_terminated_length": 39.3, + "completions/mean_length": 37.625, + "completions/mean_terminated_length": 37.625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.018244414542192482, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.17149239778518677, + "kl": 0.7874499909579754, + "learning_rate": 2.0504189027721396e-06, + "loss": 0.0001, + "num_tokens": 3883319.0, + "reward": 3.956950831413269, + "reward_std": 0.05743742329068482, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4569508492946626, + "rewards/quality_reward_func/std": 0.057437406154349446, + "step": 3010 + }, + { + "completion_length": 46.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.7, + "completions/max_terminated_length": 46.7, + "completions/mean_length": 44.225, + "completions/mean_terminated_length": 44.225, + "completions/min_length": 41.6, + "completions/min_terminated_length": 41.6, + "epoch": 0.01830502721509013, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.43216586112976074, + "kl": 0.8041438281536102, + "learning_rate": 2.033261242228203e-06, + "loss": 0.0001, + "num_tokens": 3898256.0, + "reward": 4.001222062110901, + "reward_std": 0.30263486690819263, + "rewards/coherence_reward_func/mean": 0.9224999964237213, + "rewards/coherence_reward_func/std": 0.06500000208616256, + "rewards/formatting_reward_func/mean": 1.4625, + "rewards/formatting_reward_func/std": 0.075, + "rewards/quality_reward_func/mean": 1.61622211933136, + "rewards/quality_reward_func/std": 0.16852968987077474, + "step": 3020 + }, + { + "completion_length": 41.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.3, + "completions/max_terminated_length": 41.3, + "completions/mean_length": 37.675, + "completions/mean_terminated_length": 37.675, + "completions/min_length": 34.7, + "completions/min_terminated_length": 34.7, + "epoch": 0.01836563988798778, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.18524278700351715, + "kl": 0.5940576240420341, + "learning_rate": 2.0161263298728494e-06, + "loss": 0.0, + "num_tokens": 3908251.0, + "reward": 3.921669435501099, + "reward_std": 0.08791780807077884, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4216694355010986, + "rewards/quality_reward_func/std": 0.08791773918783292, + "step": 3030 + }, + { + "completion_length": 38.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 35.875, + "completions/mean_terminated_length": 35.875, + "completions/min_length": 33.8, + "completions/min_terminated_length": 33.8, + "epoch": 0.01842625256088543, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.2265503704547882, + "kl": 0.8165366999804974, + "learning_rate": 1.9990150008375348e-06, + "loss": 0.0001, + "num_tokens": 3920502.0, + "reward": 3.9495574712753294, + "reward_std": 0.055046566482633355, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4495575547218322, + "rewards/quality_reward_func/std": 0.055046635866165164, + "step": 3040 + }, + { + "completion_length": 35.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 33.275, + "completions/mean_terminated_length": 33.275, + "completions/min_length": 30.8, + "completions/min_terminated_length": 30.8, + "epoch": 0.018486865233783078, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.1463969200849533, + "kl": 0.8312926575541496, + "learning_rate": 1.981928089104294e-06, + "loss": 0.0, + "num_tokens": 3933233.0, + "reward": 3.618661141395569, + "reward_std": 0.0848767876625061, + "rewards/coherence_reward_func/mean": 0.9399999976158142, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.17866108417511, + "rewards/quality_reward_func/std": 0.08487678728997708, + "step": 3050 + }, + { + "completion_length": 42.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.2, + "completions/max_terminated_length": 42.2, + "completions/mean_length": 38.825, + "completions/mean_terminated_length": 38.825, + "completions/min_length": 36.4, + "completions/min_terminated_length": 36.4, + "epoch": 0.01854747790668073, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.0, + "kl": 0.7010526616126299, + "learning_rate": 1.9648664274651e-06, + "loss": 0.0001, + "num_tokens": 3947194.0, + "reward": 3.801596736907959, + "reward_std": 0.07207065261900425, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3015966907143592, + "rewards/quality_reward_func/std": 0.07207060419023037, + "step": 3060 + }, + { + "completion_length": 38.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.5, + "completions/max_terminated_length": 38.5, + "completions/mean_length": 37.425, + "completions/mean_terminated_length": 37.425, + "completions/min_length": 35.5, + "completions/min_terminated_length": 35.5, + "epoch": 0.01860809057957838, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.17187735438346863, + "kl": 0.7273755632340908, + "learning_rate": 1.947830847481271e-06, + "loss": 0.0001, + "num_tokens": 3962491.0, + "reward": 3.861338996887207, + "reward_std": 0.06818611929193139, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3613389432430267, + "rewards/quality_reward_func/std": 0.06818618662655354, + "step": 3070 + }, + { + "completion_length": 35.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.7, + "completions/max_terminated_length": 35.7, + "completions/mean_length": 32.9, + "completions/mean_terminated_length": 32.9, + "completions/min_length": 30.9, + "completions/min_terminated_length": 30.9, + "epoch": 0.018668703252476028, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.16616536676883698, + "kl": 0.7653290726244449, + "learning_rate": 1.93082217944294e-06, + "loss": 0.0001, + "num_tokens": 3975243.0, + "reward": 3.9271071672439577, + "reward_std": 0.03739016959443688, + "rewards/coherence_reward_func/mean": 0.9850000023841858, + "rewards/coherence_reward_func/std": 0.017320507764816286, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4421072006225586, + "rewards/quality_reward_func/std": 0.05399639131501317, + "step": 3080 + }, + { + "completion_length": 40.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.2, + "completions/max_terminated_length": 40.2, + "completions/mean_length": 38.75, + "completions/mean_terminated_length": 38.75, + "completions/min_length": 36.4, + "completions/min_terminated_length": 36.4, + "epoch": 0.018729315925373677, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.0, + "kl": 0.7989107012748718, + "learning_rate": 1.9138412523285937e-06, + "loss": 0.0001, + "num_tokens": 3992245.0, + "reward": 3.9924602270126344, + "reward_std": 0.10027031376957893, + "rewards/coherence_reward_func/mean": 0.9925000011920929, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.49996018409729, + "rewards/quality_reward_func/std": 0.08873173072934151, + "step": 3090 + }, + { + "completion_length": 40.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.675, + "completions/mean_terminated_length": 37.675, + "completions/min_length": 35.8, + "completions/min_terminated_length": 35.8, + "epoch": 0.018789928598271326, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.1242041140794754, + "kl": 0.7136042453348637, + "learning_rate": 1.8968888937646624e-06, + "loss": 0.0001, + "num_tokens": 4006452.0, + "reward": 3.7888214111328127, + "reward_std": 0.08842599894851447, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2888214111328125, + "rewards/quality_reward_func/std": 0.08842603755183517, + "step": 3100 + }, + { + "completion_length": 44.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.7, + "completions/max_terminated_length": 44.7, + "completions/mean_length": 41.075, + "completions/mean_terminated_length": 41.075, + "completions/min_length": 38.1, + "completions/min_terminated_length": 38.1, + "epoch": 0.018850541271168975, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.0, + "kl": 0.605262978747487, + "learning_rate": 1.879965929985187e-06, + "loss": 0.0, + "num_tokens": 4020499.0, + "reward": 3.834125018119812, + "reward_std": 0.12141236998140811, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3641250669956206, + "rewards/quality_reward_func/std": 0.12141232704743743, + "step": 3110 + }, + { + "completion_length": 43.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.4, + "completions/max_terminated_length": 43.4, + "completions/mean_length": 39.7, + "completions/mean_terminated_length": 39.7, + "completions/min_length": 36.8, + "completions/min_terminated_length": 36.8, + "epoch": 0.018911153944066627, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.26001664996147156, + "kl": 0.645774920284748, + "learning_rate": 1.8630731857915451e-06, + "loss": 0.0, + "num_tokens": 4034619.0, + "reward": 3.571972608566284, + "reward_std": 0.2117477380670607, + "rewards/coherence_reward_func/mean": 0.925, + "rewards/coherence_reward_func/std": 0.05, + "rewards/formatting_reward_func/mean": 1.4625, + "rewards/formatting_reward_func/std": 0.025, + "rewards/quality_reward_func/mean": 1.184472641348839, + "rewards/quality_reward_func/std": 0.13674773583188654, + "step": 3120 + }, + { + "completion_length": 40.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.9, + "completions/max_terminated_length": 40.9, + "completions/mean_length": 40.15, + "completions/mean_terminated_length": 40.15, + "completions/min_length": 39.1, + "completions/min_terminated_length": 39.1, + "epoch": 0.018971766616964276, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.0, + "kl": 0.6743199843913317, + "learning_rate": 1.8462114845122582e-06, + "loss": 0.0, + "num_tokens": 4045341.0, + "reward": 4.098270344734192, + "reward_std": 0.039933528192341326, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.6282702922821044, + "rewards/quality_reward_func/std": 0.03993358239531517, + "step": 3130 + }, + { + "completion_length": 42.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.6, + "completions/max_terminated_length": 42.6, + "completions/mean_length": 40.35, + "completions/mean_terminated_length": 40.35, + "completions/min_length": 38.3, + "completions/min_terminated_length": 38.3, + "epoch": 0.019032379289861925, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20205184817314148, + "kl": 0.5343499461188912, + "learning_rate": 1.8293816479628584e-06, + "loss": 0.0, + "num_tokens": 4057039.0, + "reward": 3.8032551288604735, + "reward_std": 0.08389510039705783, + "rewards/coherence_reward_func/mean": 0.95, + "rewards/coherence_reward_func/std": 0.02309400886297226, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3532551646232605, + "rewards/quality_reward_func/std": 0.0626519579673186, + "step": 3140 + }, + { + "completion_length": 45.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.4, + "completions/max_terminated_length": 45.4, + "completions/mean_length": 42.625, + "completions/mean_terminated_length": 42.625, + "completions/min_length": 39.5, + "completions/min_terminated_length": 39.5, + "epoch": 0.019092991962759574, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.24577243626117706, + "kl": 0.6044593669474125, + "learning_rate": 1.8125844964058354e-06, + "loss": 0.0001, + "num_tokens": 4072148.0, + "reward": 3.954366612434387, + "reward_std": 0.13101303055882454, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4843665599822997, + "rewards/quality_reward_func/std": 0.13101302906870843, + "step": 3150 + }, + { + "completion_length": 50.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.6, + "completions/max_terminated_length": 50.6, + "completions/mean_length": 47.1, + "completions/mean_terminated_length": 47.1, + "completions/min_length": 44.2, + "completions/min_terminated_length": 44.2, + "epoch": 0.019153604635657222, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.10859663039445877, + "kl": 0.654767077229917, + "learning_rate": 1.7958208485106586e-06, + "loss": 0.0001, + "num_tokens": 4086408.0, + "reward": 4.028897976875305, + "reward_std": 0.039625269593670964, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5588978946208953, + "rewards/quality_reward_func/std": 0.03962532239966095, + "step": 3160 + }, + { + "completion_length": 43.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.7, + "completions/max_terminated_length": 43.7, + "completions/mean_length": 40.05, + "completions/mean_terminated_length": 40.05, + "completions/min_length": 34.6, + "completions/min_terminated_length": 34.6, + "epoch": 0.01921421730855487, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18048399686813354, + "kl": 0.6584017806686461, + "learning_rate": 1.7790915213138777e-06, + "loss": 0.0001, + "num_tokens": 4100390.0, + "reward": 3.640557956695557, + "reward_std": 0.11440884659532458, + "rewards/coherence_reward_func/mean": 0.9925000011920929, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.1480579733848573, + "rewards/quality_reward_func/std": 0.11514959074556827, + "step": 3170 + }, + { + "completion_length": 41.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.3, + "completions/max_terminated_length": 41.3, + "completions/mean_length": 39.625, + "completions/mean_terminated_length": 39.625, + "completions/min_length": 37.3, + "completions/min_terminated_length": 37.3, + "epoch": 0.019274829981452524, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.22853608429431915, + "kl": 0.5096729185432196, + "learning_rate": 1.7623973301792964e-06, + "loss": 0.0, + "num_tokens": 4112795.0, + "reward": 3.6959209203720094, + "reward_std": 0.09516287457663566, + "rewards/coherence_reward_func/mean": 0.8950000017881393, + "rewards/coherence_reward_func/std": 0.037320506572723386, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3009209588170052, + "rewards/quality_reward_func/std": 0.06619152534985914, + "step": 3180 + }, + { + "completion_length": 37.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.1, + "completions/max_terminated_length": 37.1, + "completions/mean_length": 35.175, + "completions/mean_terminated_length": 35.175, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.019335442654350173, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.2885758578777313, + "kl": 1.076241620257497, + "learning_rate": 1.745739088758242e-06, + "loss": 0.0001, + "num_tokens": 4125142.0, + "reward": 3.7729340553283692, + "reward_std": 0.06683287937194109, + "rewards/coherence_reward_func/mean": 0.9925000011920929, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2804340660572051, + "rewards/quality_reward_func/std": 0.07385497200302779, + "step": 3190 + }, + { + "completion_length": 39.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.8, + "completions/max_terminated_length": 39.8, + "completions/mean_length": 37.675, + "completions/mean_terminated_length": 37.675, + "completions/min_length": 35.3, + "completions/min_terminated_length": 35.3, + "epoch": 0.01939605532724782, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.0, + "kl": 0.8483439676463604, + "learning_rate": 1.7291176089498969e-06, + "loss": 0.0001, + "num_tokens": 4136533.0, + "reward": 3.761235785484314, + "reward_std": 0.1376682033762336, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2612357795238496, + "rewards/quality_reward_func/std": 0.13766820412129163, + "step": 3200 + }, + { + "completion_length": 39.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.5, + "completions/max_terminated_length": 39.5, + "completions/mean_length": 37.45, + "completions/mean_terminated_length": 37.45, + "completions/min_length": 34.9, + "completions/min_terminated_length": 34.9, + "epoch": 0.01945666800014547, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.43329519033432007, + "kl": 0.8543856620788575, + "learning_rate": 1.7125337008617387e-06, + "loss": 0.0001, + "num_tokens": 4148683.0, + "reward": 4.058108305931091, + "reward_std": 0.07306739278137683, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5881082952022552, + "rewards/quality_reward_func/std": 0.07306739557534456, + "step": 3210 + }, + { + "completion_length": 50.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.8, + "completions/max_terminated_length": 50.8, + "completions/mean_length": 46.3, + "completions/mean_terminated_length": 46.3, + "completions/min_length": 42.4, + "completions/min_terminated_length": 42.4, + "epoch": 0.01951728067304312, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.298828661441803, + "kl": 0.5643864538520574, + "learning_rate": 1.6959881727700508e-06, + "loss": 0.0001, + "num_tokens": 4160563.0, + "reward": 3.963113474845886, + "reward_std": 0.10927062275586649, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.463113397359848, + "rewards/quality_reward_func/std": 0.10927061671391129, + "step": 3220 + }, + { + "completion_length": 36.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.4, + "completions/max_terminated_length": 36.4, + "completions/mean_length": 34.2, + "completions/mean_terminated_length": 34.2, + "completions/min_length": 32.3, + "completions/min_terminated_length": 32.3, + "epoch": 0.019577893345940768, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.6677718162536621, + "kl": 0.8229854173958302, + "learning_rate": 1.679481831080531e-06, + "loss": 0.0001, + "num_tokens": 4174915.0, + "reward": 3.848472833633423, + "reward_std": 0.10531683061271906, + "rewards/coherence_reward_func/mean": 0.9774999976158142, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3709728479385377, + "rewards/quality_reward_func/std": 0.09031676249578595, + "step": 3230 + }, + { + "completion_length": 38.7, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.7, + "completions/max_terminated_length": 38.7, + "completions/mean_length": 32.525, + "completions/mean_terminated_length": 32.525, + "completions/min_length": 28.2, + "completions/min_terminated_length": 28.2, + "epoch": 0.01963850601883842, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.2357165664434433, + "kl": 1.063683407753706, + "learning_rate": 1.6630154802889859e-06, + "loss": 0.0001, + "num_tokens": 4188168.0, + "reward": 3.6370152950286867, + "reward_std": 0.08581735389307141, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.1670151889324187, + "rewards/quality_reward_func/std": 0.08581737205386161, + "step": 3240 + }, + { + "completion_length": 40.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 37.575, + "completions/mean_terminated_length": 37.575, + "completions/min_length": 35.3, + "completions/min_terminated_length": 35.3, + "epoch": 0.01969911869173607, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.20572206377983093, + "kl": 0.7876749344170093, + "learning_rate": 1.6465899229421225e-06, + "loss": 0.0001, + "num_tokens": 4202543.0, + "reward": 3.983400321006775, + "reward_std": 0.047867730259895325, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5134003400802611, + "rewards/quality_reward_func/std": 0.04786777477711439, + "step": 3250 + }, + { + "completion_length": 41.9, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.9, + "completions/max_terminated_length": 41.9, + "completions/mean_length": 39.8, + "completions/mean_terminated_length": 39.8, + "completions/min_length": 38.1, + "completions/min_terminated_length": 38.1, + "epoch": 0.019759731364633718, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16716791689395905, + "kl": 0.8925764039158821, + "learning_rate": 1.630205959598433e-06, + "loss": 0.0001, + "num_tokens": 4215987.0, + "reward": 3.917082405090332, + "reward_std": 0.09645090424455702, + "rewards/coherence_reward_func/mean": 0.9925000011920929, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.424582439661026, + "rewards/quality_reward_func/std": 0.08145089484751225, + "step": 3260 + }, + { + "completion_length": 40.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.2, + "completions/max_terminated_length": 40.2, + "completions/mean_length": 38.55, + "completions/mean_terminated_length": 38.55, + "completions/min_length": 36.8, + "completions/min_terminated_length": 36.8, + "epoch": 0.019820344037531367, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.0, + "kl": 0.7571708537638188, + "learning_rate": 1.6138643887891765e-06, + "loss": 0.0001, + "num_tokens": 4229725.0, + "reward": 3.760947751998901, + "reward_std": 0.041113514872267845, + "rewards/coherence_reward_func/mean": 0.925, + "rewards/coherence_reward_func/std": 0.017320507764816286, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3359477519989014, + "rewards/quality_reward_func/std": 0.038945339154452085, + "step": 3270 + }, + { + "completion_length": 38.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.8, + "completions/max_terminated_length": 38.8, + "completions/mean_length": 36.725, + "completions/mean_terminated_length": 36.725, + "completions/min_length": 35.3, + "completions/min_terminated_length": 35.3, + "epoch": 0.019880956710429016, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.2101743221282959, + "kl": 0.6681413215585053, + "learning_rate": 1.597566006979459e-06, + "loss": 0.0001, + "num_tokens": 4241722.0, + "reward": 4.057679986953735, + "reward_std": 0.09815444834530354, + "rewards/coherence_reward_func/mean": 0.9824999988079071, + "rewards/coherence_reward_func/std": 0.034999999403953555, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5751800179481505, + "rewards/quality_reward_func/std": 0.0710665188729763, + "step": 3280 + }, + { + "completion_length": 36.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.2, + "completions/max_terminated_length": 36.2, + "completions/mean_length": 35.675, + "completions/mean_terminated_length": 35.675, + "completions/min_length": 35.1, + "completions/min_terminated_length": 35.1, + "epoch": 0.019941569383326665, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.15175789594650269, + "kl": 0.9505894485861063, + "learning_rate": 1.5813116085294172e-06, + "loss": 0.0001, + "num_tokens": 4254845.0, + "reward": 4.126423382759095, + "reward_std": 0.02618008037097752, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.6264233708381652, + "rewards/quality_reward_func/std": 0.026180061488412322, + "step": 3290 + }, + { + "completion_length": 39.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 34.775, + "completions/mean_terminated_length": 34.775, + "completions/min_length": 30.5, + "completions/min_terminated_length": 30.5, + "epoch": 0.020002182056224314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18640796840190887, + "kl": 0.9450171794742346, + "learning_rate": 1.5651019856554995e-06, + "loss": 0.0001, + "num_tokens": 4270140.0, + "reward": 3.4720892190933226, + "reward_std": 0.18899639658629894, + "rewards/coherence_reward_func/mean": 0.9474999994039536, + "rewards/coherence_reward_func/std": 0.034999999403953555, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.0245892763137818, + "rewards/quality_reward_func/std": 0.153996386192739, + "step": 3300 + }, + { + "completion_length": 41.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.8, + "completions/max_terminated_length": 41.8, + "completions/mean_length": 37.875, + "completions/mean_terminated_length": 37.875, + "completions/min_length": 35.6, + "completions/min_terminated_length": 35.6, + "epoch": 0.020062794729121966, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.18235939741134644, + "kl": 0.6776700124144555, + "learning_rate": 1.5489379283918566e-06, + "loss": 0.0, + "num_tokens": 4282883.0, + "reward": 3.8034504652023315, + "reward_std": 0.09744351711124181, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.333450472354889, + "rewards/quality_reward_func/std": 0.09744351767003537, + "step": 3310 + }, + { + "completion_length": 45.1, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.1, + "completions/max_terminated_length": 45.1, + "completions/mean_length": 40.75, + "completions/mean_terminated_length": 40.75, + "completions/min_length": 36.5, + "completions/min_terminated_length": 36.5, + "epoch": 0.020123407402019615, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.37760046124458313, + "kl": 0.949538280069828, + "learning_rate": 1.5328202245518348e-06, + "loss": 0.0001, + "num_tokens": 4295249.0, + "reward": 3.8073550939559935, + "reward_std": 0.12139484300278128, + "rewards/coherence_reward_func/mean": 0.9675000011920929, + "rewards/coherence_reward_func/std": 0.052320507168769834, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3398550689220428, + "rewards/quality_reward_func/std": 0.13598596872761845, + "step": 3320 + }, + { + "completion_length": 31.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.2, + "completions/max_terminated_length": 31.2, + "completions/mean_length": 30.95, + "completions/mean_terminated_length": 30.95, + "completions/min_length": 30.6, + "completions/min_terminated_length": 30.6, + "epoch": 0.020184020074917264, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "kl": 0.9487326897680759, + "learning_rate": 1.5167496596895814e-06, + "loss": 0.0001, + "num_tokens": 4307515.0, + "reward": 3.90595965385437, + "reward_std": 0.07731840866617859, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4059597253799438, + "rewards/quality_reward_func/std": 0.07731841276399791, + "step": 3330 + }, + { + "completion_length": 39.4, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.4, + "completions/max_terminated_length": 39.4, + "completions/mean_length": 34.775, + "completions/mean_terminated_length": 34.775, + "completions/min_length": 31.7, + "completions/min_terminated_length": 31.7, + "epoch": 0.020244632747814913, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.23343947529792786, + "kl": 0.8172248054295779, + "learning_rate": 1.500727017061756e-06, + "loss": 0.0001, + "num_tokens": 4319442.0, + "reward": 3.517696112394333, + "reward_std": 0.2403232785873115, + "rewards/coherence_reward_func/mean": 0.925, + "rewards/coherence_reward_func/std": 0.05, + "rewards/formatting_reward_func/mean": 1.3875, + "rewards/formatting_reward_func/std": 0.075, + "rewards/quality_reward_func/mean": 1.2051961183547975, + "rewards/quality_reward_func/std": 0.11532331230118871, + "step": 3340 + }, + { + "completion_length": 39.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.8, + "completions/max_terminated_length": 39.8, + "completions/mean_length": 36.0, + "completions/mean_terminated_length": 36.0, + "completions/min_length": 32.6, + "completions/min_terminated_length": 32.6, + "epoch": 0.02030524542071256, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.15448229014873505, + "kl": 0.7963626474142075, + "learning_rate": 1.4847530775893555e-06, + "loss": 0.0001, + "num_tokens": 4328866.0, + "reward": 3.692489814758301, + "reward_std": 0.12964325528591872, + "rewards/coherence_reward_func/mean": 0.9850000023841858, + "rewards/coherence_reward_func/std": 0.017320507764816286, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.207489800453186, + "rewards/quality_reward_func/std": 0.11372499875724315, + "step": 3350 + }, + { + "completion_length": 41.6, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.6, + "completions/max_terminated_length": 41.6, + "completions/mean_length": 39.8, + "completions/mean_terminated_length": 39.8, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.02036585809361021, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.09208908677101135, + "kl": 0.723385076597333, + "learning_rate": 1.4688286198196524e-06, + "loss": 0.0001, + "num_tokens": 4341758.0, + "reward": 3.7400954008102416, + "reward_std": 0.06603281607385725, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2400953948497773, + "rewards/quality_reward_func/std": 0.06603283025324344, + "step": 3360 + }, + { + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 31.975, + "completions/mean_terminated_length": 31.975, + "completions/min_length": 30.2, + "completions/min_terminated_length": 30.2, + "epoch": 0.020426470766507863, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.2501620054244995, + "kl": 0.9304806463420391, + "learning_rate": 1.4529544198882545e-06, + "loss": 0.0001, + "num_tokens": 4354805.0, + "reward": 3.899626541137695, + "reward_std": 0.0629223863594234, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3996265590190888, + "rewards/quality_reward_func/std": 0.06292239651083946, + "step": 3370 + }, + { + "completion_length": 38.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.8, + "completions/max_terminated_length": 38.8, + "completions/mean_length": 37.15, + "completions/mean_terminated_length": 37.15, + "completions/min_length": 35.8, + "completions/min_terminated_length": 35.8, + "epoch": 0.02048708343940551, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.0, + "kl": 1.0188420228660107, + "learning_rate": 1.4371312514812686e-06, + "loss": 0.0001, + "num_tokens": 4369087.0, + "reward": 4.25221619606018, + "reward_std": 0.07234206513967364, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.7822161555290221, + "rewards/quality_reward_func/std": 0.07234203468542547, + "step": 3380 + }, + { + "completion_length": 39.3, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.3, + "completions/max_terminated_length": 39.3, + "completions/mean_length": 37.65, + "completions/mean_terminated_length": 37.65, + "completions/min_length": 35.9, + "completions/min_terminated_length": 35.9, + "epoch": 0.02054769611230316, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.14413855969905853, + "kl": 0.8577508796006441, + "learning_rate": 1.4213598857976024e-06, + "loss": 0.0001, + "num_tokens": 4384409.0, + "reward": 3.8944859981536863, + "reward_std": 0.11143267480656505, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.4244859516620636, + "rewards/quality_reward_func/std": 0.11143264099955559, + "step": 3390 + }, + { + "completion_length": 36.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.8, + "completions/max_terminated_length": 36.8, + "completions/mean_length": 34.45, + "completions/mean_terminated_length": 34.45, + "completions/min_length": 31.4, + "completions/min_terminated_length": 31.4, + "epoch": 0.02060830878520081, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.0, + "kl": 1.0671353876590728, + "learning_rate": 1.405641091511368e-06, + "loss": 0.0001, + "num_tokens": 4396347.0, + "reward": 3.7803528785705565, + "reward_std": 0.07597720525227487, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2803528964519502, + "rewards/quality_reward_func/std": 0.07597720911726355, + "step": 3400 + }, + { + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.5, + "completions/max_terminated_length": 37.5, + "completions/mean_length": 34.575, + "completions/mean_terminated_length": 34.575, + "completions/min_length": 32.2, + "completions/min_terminated_length": 32.2, + "epoch": 0.02066892145809846, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.0, + "kl": 0.9999676614999771, + "learning_rate": 1.3899756347344235e-06, + "loss": 0.0001, + "num_tokens": 4409734.0, + "reward": 4.007402873039245, + "reward_std": 0.06215168377384543, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.5074028015136718, + "rewards/quality_reward_func/std": 0.06215167883783579, + "step": 3410 + }, + { + "completion_length": 33.8, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.8, + "completions/max_terminated_length": 33.8, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 30.6, + "completions/min_terminated_length": 30.6, + "epoch": 0.020729534130996107, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.2397295981645584, + "kl": 0.9598039738833904, + "learning_rate": 1.3743642789790317e-06, + "loss": 0.0001, + "num_tokens": 4420570.0, + "reward": 3.8145102739334105, + "reward_std": 0.07386084916070104, + "rewards/coherence_reward_func/mean": 0.9699999988079071, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.3445102512836455, + "rewards/quality_reward_func/std": 0.07386083230376243, + "step": 3420 + }, + { + "completion_length": 34.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 32.725, + "completions/mean_terminated_length": 32.725, + "completions/min_length": 31.8, + "completions/min_terminated_length": 31.8, + "epoch": 0.02079014680389376, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.24414551258087158, + "kl": 0.7642092987895012, + "learning_rate": 1.358807785120647e-06, + "loss": 0.0001, + "num_tokens": 4436575.0, + "reward": 3.7026368618011474, + "reward_std": 0.0684517988935113, + "rewards/coherence_reward_func/mean": 1.0, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.2026368647813797, + "rewards/quality_reward_func/std": 0.06845179051160813, + "step": 3430 + }, + { + "completion_length": 34.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.5, + "completions/max_terminated_length": 34.5, + "completions/mean_length": 33.625, + "completions/mean_terminated_length": 33.625, + "completions/min_length": 32.8, + "completions/min_terminated_length": 32.8, + "epoch": 0.02085075947679141, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.40003758668899536, + "kl": 0.7258573945611715, + "learning_rate": 1.343306911360833e-06, + "loss": 0.0, + "num_tokens": 4447768.0, + "reward": 3.737292194366455, + "reward_std": 0.12735685943625868, + "rewards/coherence_reward_func/mean": 0.9925000011920929, + "rewards/coherence_reward_func/std": 0.015000002086162567, + "rewards/formatting_reward_func/mean": 1.5, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.244792139530182, + "rewards/quality_reward_func/std": 0.11480730390176178, + "step": 3440 + }, + { + "completion_length": 36.2, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.2, + "completions/max_terminated_length": 36.2, + "completions/mean_length": 33.15, + "completions/mean_terminated_length": 33.15, + "completions/min_length": 31.6, + "completions/min_terminated_length": 31.6, + "epoch": 0.020911372149689057, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.867883026599884, + "kl": 1.2108185835182668, + "learning_rate": 1.3278624131903088e-06, + "loss": 0.0001, + "num_tokens": 4458758.0, + "reward": 3.524269163608551, + "reward_std": 0.07325016092509032, + "rewards/coherence_reward_func/mean": 0.9, + "rewards/coherence_reward_func/std": 0.0, + "rewards/formatting_reward_func/mean": 1.45, + "rewards/formatting_reward_func/std": 0.0, + "rewards/quality_reward_func/mean": 1.1742691427469254, + "rewards/quality_reward_func/std": 0.0732501860242337, + "step": 3450 } ], "logging_steps": 10, - "max_steps": 2500, - "num_input_tokens_seen": 3034389, + "max_steps": 5000, + "num_input_tokens_seen": 4458758, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": {