{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.09074410163339383, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.00018148820326678765, "frac_reward_zero_std": 0.0, "grad_norm": 0.08199785649776459, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 13468.0, "reward": 0.675000011920929, "reward_std": 0.471699059009552, "rewards/wrapped_reward_func/mean": 0.675000011920929, "rewards/wrapped_reward_func/std": 0.471699059009552, "step": 1 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0003629764065335753, "frac_reward_zero_std": 0.0, "grad_norm": 0.1479763686656952, "kl": 0.0, "learning_rate": 1.0000000000000001e-07, "loss": 0.0, "num_tokens": 26936.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 2 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.000544464609800363, "frac_reward_zero_std": 0.0, "grad_norm": 0.17236804962158203, "kl": 0.0008469677122775465, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "num_tokens": 33528.0, "reward": 0.42500001192092896, "reward_std": 0.5057997107505798, "rewards/wrapped_reward_func/mean": 0.42500001192092896, "rewards/wrapped_reward_func/std": 0.5057997107505798, "step": 3 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0007259528130671506, "frac_reward_zero_std": 0.0, "grad_norm": 0.07612831890583038, "kl": 0.000500970461871475, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "num_tokens": 40120.0, "reward": 0.17499999701976776, "reward_std": 0.3499999940395355, "rewards/wrapped_reward_func/mean": 0.17499999701976776, "rewards/wrapped_reward_func/std": 0.3499999940395355, "step": 4 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0009074410163339383, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011424272815929726, "kl": 0.0009664897515904158, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "num_tokens": 45892.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 5 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.001088929219600726, "frac_reward_zero_std": 1.0, "grad_norm": 4.139049997320399e-05, "kl": 0.00040519051253795624, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "num_tokens": 51664.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 6 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0012704174228675136, "frac_reward_zero_std": 0.0, "grad_norm": 0.14961789548397064, "kl": 0.0008240456809289753, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "num_tokens": 62564.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 7 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0014519056261343012, "frac_reward_zero_std": 0.0, "grad_norm": 0.12039563059806824, "kl": 0.00035998484236188233, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "num_tokens": 73464.0, "reward": 0.42500001192092896, "reward_std": 0.5057997107505798, "rewards/wrapped_reward_func/mean": 0.42500001192092896, "rewards/wrapped_reward_func/std": 0.5057997107505798, "step": 8 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.001633393829401089, "frac_reward_zero_std": 1.0, "grad_norm": 3.7600046198349446e-05, "kl": 0.000430698215495795, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "num_tokens": 79468.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 9 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0018148820326678765, "frac_reward_zero_std": 1.0, "grad_norm": 7.923651719465852e-05, "kl": 0.0008369213610421866, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 85472.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 10 }, { "completion_length": 442.75, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 442.75, "completions/mean_terminated_length": 235.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.001996370235934664, "frac_reward_zero_std": 0.0, "grad_norm": 0.13887488842010498, "kl": 0.0007510318828281015, "learning_rate": 1.0000000000000002e-06, "loss": 0.071, "num_tokens": 90331.0, "reward": 0.17499999701976776, "reward_std": 0.3499999940395355, "rewards/wrapped_reward_func/mean": 0.17499999701976776, "rewards/wrapped_reward_func/std": 0.3499999940395355, "step": 11 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.002177858439201452, "frac_reward_zero_std": 0.0, "grad_norm": 0.15148557722568512, "kl": 0.0005390114383772016, "learning_rate": 1.1e-06, "loss": 0.0, "num_tokens": 95467.0, "reward": 0.42500001192092896, "reward_std": 0.5057997107505798, "rewards/wrapped_reward_func/mean": 0.42500001192092896, "rewards/wrapped_reward_func/std": 0.5057997107505798, "step": 12 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0023593466424682396, "frac_reward_zero_std": 1.0, "grad_norm": 5.918050737818703e-05, "kl": 0.0005271413829177618, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "num_tokens": 102495.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 13 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.002540834845735027, "frac_reward_zero_std": 1.0, "grad_norm": 6.525542994495481e-05, "kl": 0.0004630213079508394, "learning_rate": 1.3e-06, "loss": 0.0, "num_tokens": 109523.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 14 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0027223230490018148, "frac_reward_zero_std": 0.0, "grad_norm": 0.03521356359124184, "kl": 0.0008276549633592367, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "num_tokens": 113763.0, "reward": 0.949999988079071, "reward_std": 0.05773504078388214, "rewards/wrapped_reward_func/mean": 0.949999988079071, "rewards/wrapped_reward_func/std": 0.05773504078388214, "step": 15 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0029038112522686023, "frac_reward_zero_std": 0.0, "grad_norm": 0.11479217559099197, "kl": 0.00037548685213550925, "learning_rate": 1.5e-06, "loss": 0.0, "num_tokens": 118003.0, "reward": 0.7250000238418579, "reward_std": 0.4856267273426056, "rewards/wrapped_reward_func/mean": 0.7250000238418579, "rewards/wrapped_reward_func/std": 0.4856267273426056, "step": 16 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0030852994555353903, "frac_reward_zero_std": 0.0, "grad_norm": 0.16638903319835663, "kl": 0.0005360331851989031, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "num_tokens": 123007.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/wrapped_reward_func/mean": 0.5, "rewards/wrapped_reward_func/std": 0.5773502588272095, "step": 17 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.003266787658802178, "frac_reward_zero_std": 0.0, "grad_norm": 0.14312264323234558, "kl": 0.00025895965518429875, "learning_rate": 1.7000000000000002e-06, "loss": 0.0, "num_tokens": 128011.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 18 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0034482758620689655, "frac_reward_zero_std": 1.0, "grad_norm": 4.9306079745292664e-05, "kl": 0.00046938162995502353, "learning_rate": 1.8e-06, "loss": 0.0, "num_tokens": 134215.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 19 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.003629764065335753, "frac_reward_zero_std": 0.0, "grad_norm": 0.11429649591445923, "kl": 0.0006803146388847381, "learning_rate": 1.9e-06, "loss": 0.0, "num_tokens": 140419.0, "reward": 0.17499999701976776, "reward_std": 0.3499999940395355, "rewards/wrapped_reward_func/mean": 0.17499999701976776, "rewards/wrapped_reward_func/std": 0.3499999940395355, "step": 20 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.003811252268602541, "frac_reward_zero_std": 1.0, "grad_norm": 3.943451156374067e-05, "kl": 0.00040814600652083755, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "num_tokens": 147891.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 21 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.003992740471869328, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010384265624452382, "kl": 0.0006963205814827234, "learning_rate": 2.1000000000000002e-06, "loss": 0.0, "num_tokens": 155363.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 22 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.004174228675136116, "frac_reward_zero_std": 0.0, "grad_norm": 0.06597988307476044, "kl": 0.0006129970424808562, "learning_rate": 2.2e-06, "loss": 0.0, "num_tokens": 159063.0, "reward": 0.30000001192092896, "reward_std": 0.20000000298023224, "rewards/wrapped_reward_func/mean": 0.30000001192092896, "rewards/wrapped_reward_func/std": 0.20000000298023224, "step": 23 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.004355716878402904, "frac_reward_zero_std": 0.0, "grad_norm": 0.08174296468496323, "kl": 0.0005434465711005032, "learning_rate": 2.3e-06, "loss": 0.0, "num_tokens": 162763.0, "reward": 0.30000001192092896, "reward_std": 0.20000000298023224, "rewards/wrapped_reward_func/mean": 0.30000001192092896, "rewards/wrapped_reward_func/std": 0.20000000298023224, "step": 24 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.004537205081669692, "frac_reward_zero_std": 1.0, "grad_norm": 1.9328228518133983e-05, "kl": 0.00022128832642920315, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "num_tokens": 173111.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 25 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.004718693284936479, "frac_reward_zero_std": 0.0, "grad_norm": 0.07385554909706116, "kl": 0.0001980169618036598, "learning_rate": 2.5e-06, "loss": 0.0, "num_tokens": 183459.0, "reward": 0.6000000238418579, "reward_std": 0.4242640733718872, "rewards/wrapped_reward_func/mean": 0.6000000238418579, "rewards/wrapped_reward_func/std": 0.4242640733718872, "step": 26 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.004900181488203267, "frac_reward_zero_std": 0.0, "grad_norm": 0.007210784126073122, "kl": 0.00030259849154390395, "learning_rate": 2.6e-06, "loss": 0.0, "num_tokens": 197267.0, "reward": 0.02500000037252903, "reward_std": 0.05000000074505806, "rewards/wrapped_reward_func/mean": 0.02500000037252903, "rewards/wrapped_reward_func/std": 0.05000000074505806, "step": 27 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.005081669691470054, "frac_reward_zero_std": 1.0, "grad_norm": 2.872776531148702e-05, "kl": 0.00034294030047021806, "learning_rate": 2.7e-06, "loss": 0.0, "num_tokens": 211075.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 28 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.005263157894736842, "frac_reward_zero_std": 0.0, "grad_norm": 0.03165740892291069, "kl": 0.00034573746961541474, "learning_rate": 2.8000000000000003e-06, "loss": 0.0, "num_tokens": 218407.0, "reward": 0.625, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.625, "rewards/wrapped_reward_func/std": 0.14999999105930328, "step": 29 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0054446460980036296, "frac_reward_zero_std": 1.0, "grad_norm": 5.250574758974835e-05, "kl": 0.000510701589519158, "learning_rate": 2.9e-06, "loss": 0.0, "num_tokens": 225739.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 30 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.005626134301270417, "frac_reward_zero_std": 0.0, "grad_norm": 0.0869903638958931, "kl": 0.00028751895297318697, "learning_rate": 3e-06, "loss": 0.0, "num_tokens": 237019.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 31 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.005807622504537205, "frac_reward_zero_std": 1.0, "grad_norm": 2.7712700102711096e-05, "kl": 0.0003089166712015867, "learning_rate": 3.1e-06, "loss": 0.0, "num_tokens": 248299.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 32 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.005989110707803993, "frac_reward_zero_std": 0.0, "grad_norm": 0.06308193504810333, "kl": 0.0005865825805813074, "learning_rate": 3.2000000000000003e-06, "loss": 0.0, "num_tokens": 255711.0, "reward": 0.8500000238418579, "reward_std": 0.17320507764816284, "rewards/wrapped_reward_func/mean": 0.8500000238418579, "rewards/wrapped_reward_func/std": 0.17320507764816284, "step": 33 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.006170598911070781, "frac_reward_zero_std": 0.0, "grad_norm": 0.1916787028312683, "kl": 0.0008034926722757518, "learning_rate": 3.3e-06, "loss": 0.0, "num_tokens": 263123.0, "reward": 0.675000011920929, "reward_std": 0.471699059009552, "rewards/wrapped_reward_func/mean": 0.675000011920929, "rewards/wrapped_reward_func/std": 0.471699059009552, "step": 34 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.006352087114337568, "frac_reward_zero_std": 1.0, "grad_norm": 8.181727025657892e-05, "kl": 0.000809456076240167, "learning_rate": 3.4000000000000005e-06, "loss": 0.0, "num_tokens": 266555.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 35 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.006533575317604356, "frac_reward_zero_std": 1.0, "grad_norm": 4.240201451466419e-05, "kl": 0.000527553609572351, "learning_rate": 3.5000000000000004e-06, "loss": 0.0, "num_tokens": 269987.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 36 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.006715063520871143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1353188008069992, "kl": 0.0006768091698177159, "learning_rate": 3.6e-06, "loss": 0.0, "num_tokens": 283243.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/wrapped_reward_func/mean": 0.5, "rewards/wrapped_reward_func/std": 0.5773502588272095, "step": 37 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.006896551724137931, "frac_reward_zero_std": 0.0, "grad_norm": 0.1198098435997963, "kl": 0.0004984055121894926, "learning_rate": 3.7e-06, "loss": 0.0, "num_tokens": 296499.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 38 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0070780399274047185, "frac_reward_zero_std": 1.0, "grad_norm": 4.017218088847585e-05, "kl": 0.00021315019694156945, "learning_rate": 3.8e-06, "loss": 0.0, "num_tokens": 302071.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 39 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.007259528130671506, "frac_reward_zero_std": 0.0, "grad_norm": 0.17292706668376923, "kl": 0.0007002031488809735, "learning_rate": 3.9e-06, "loss": 0.0, "num_tokens": 307643.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 40 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.007441016333938294, "frac_reward_zero_std": 1.0, "grad_norm": 6.0095950175309554e-05, "kl": 0.0005370419530663639, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "num_tokens": 313167.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 41 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.007622504537205082, "frac_reward_zero_std": 1.0, "grad_norm": 5.908232560614124e-05, "kl": 0.0003241816011723131, "learning_rate": 4.1000000000000006e-06, "loss": 0.0, "num_tokens": 318691.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 42 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.00780399274047187, "frac_reward_zero_std": 0.0, "grad_norm": 0.08644702285528183, "kl": 0.0003747711598407477, "learning_rate": 4.2000000000000004e-06, "loss": 0.0, "num_tokens": 325083.0, "reward": 0.5, "reward_std": 0.39157798886299133, "rewards/wrapped_reward_func/mean": 0.5, "rewards/wrapped_reward_func/std": 0.39157798886299133, "step": 43 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.007985480943738656, "frac_reward_zero_std": 0.0, "grad_norm": 0.08594585210084915, "kl": 0.0004911240248475224, "learning_rate": 4.2999999999999995e-06, "loss": 0.0, "num_tokens": 331475.0, "reward": 0.5249999761581421, "reward_std": 0.3499999940395355, "rewards/wrapped_reward_func/mean": 0.5249999761581421, "rewards/wrapped_reward_func/std": 0.3499999940395355, "step": 44 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.008166969147005444, "frac_reward_zero_std": 0.0, "grad_norm": 0.10985252261161804, "kl": 0.00032619296689517796, "learning_rate": 4.4e-06, "loss": 0.0, "num_tokens": 341347.0, "reward": 0.675000011920929, "reward_std": 0.4716990292072296, "rewards/wrapped_reward_func/mean": 0.675000011920929, "rewards/wrapped_reward_func/std": 0.471699059009552, "step": 45 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.008348457350272231, "frac_reward_zero_std": 1.0, "grad_norm": 7.781680324114859e-05, "kl": 0.0007697606342844665, "learning_rate": 4.5e-06, "loss": 0.0, "num_tokens": 351219.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 46 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.00852994555353902, "frac_reward_zero_std": 1.0, "grad_norm": 3.769352406379767e-05, "kl": 0.0005859671218786389, "learning_rate": 4.6e-06, "loss": 0.0, "num_tokens": 354623.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 47 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.008711433756805808, "frac_reward_zero_std": 1.0, "grad_norm": 7.562957762274891e-05, "kl": 0.0004635382501874119, "learning_rate": 4.7e-06, "loss": 0.0, "num_tokens": 358027.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 48 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.008892921960072596, "frac_reward_zero_std": 0.0, "grad_norm": 0.23114216327667236, "kl": 0.0009065223857760429, "learning_rate": 4.800000000000001e-06, "loss": 0.0, "num_tokens": 365287.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 49 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.009074410163339383, "frac_reward_zero_std": 0.0, "grad_norm": 0.10325120389461517, "kl": 0.00037806713953614235, "learning_rate": 4.9000000000000005e-06, "loss": 0.0, "num_tokens": 372547.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 50 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.009255898366606171, "frac_reward_zero_std": 0.0, "grad_norm": 0.06120739132165909, "kl": 0.0004869922122452408, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 382127.0, "reward": 0.30000001192092896, "reward_std": 0.20000000298023224, "rewards/wrapped_reward_func/mean": 0.30000001192092896, "rewards/wrapped_reward_func/std": 0.20000000298023224, "step": 51 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.009437386569872959, "frac_reward_zero_std": 0.0, "grad_norm": 0.09679339081048965, "kl": 0.0005458757805172354, "learning_rate": 5.1e-06, "loss": 0.0, "num_tokens": 391707.0, "reward": 0.6000000238418579, "reward_std": 0.4242640733718872, "rewards/wrapped_reward_func/mean": 0.6000000238418579, "rewards/wrapped_reward_func/std": 0.4242640733718872, "step": 52 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.009618874773139746, "frac_reward_zero_std": 1.0, "grad_norm": 4.003878348157741e-05, "kl": 0.00048135698307305574, "learning_rate": 5.2e-06, "loss": 0.0, "num_tokens": 400243.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 53 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.009800362976406534, "frac_reward_zero_std": 1.0, "grad_norm": 8.340876956935972e-05, "kl": 0.0006227846024557948, "learning_rate": 5.3e-06, "loss": 0.0, "num_tokens": 408779.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 54 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.009981851179673321, "frac_reward_zero_std": 1.0, "grad_norm": 7.871934212744236e-05, "kl": 0.0008791287546046078, "learning_rate": 5.4e-06, "loss": 0.0, "num_tokens": 412775.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 55 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.010163339382940109, "frac_reward_zero_std": 1.0, "grad_norm": 2.6749015887617134e-05, "kl": 0.00047088603605516255, "learning_rate": 5.500000000000001e-06, "loss": 0.0, "num_tokens": 416771.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 56 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.010344827586206896, "frac_reward_zero_std": 1.0, "grad_norm": 4.4247186451684684e-05, "kl": 0.00042114188545383513, "learning_rate": 5.600000000000001e-06, "loss": 0.0, "num_tokens": 427427.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 57 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.010526315789473684, "frac_reward_zero_std": 0.0, "grad_norm": 0.07081616669893265, "kl": 0.0002741820935625583, "learning_rate": 5.7000000000000005e-06, "loss": 0.0, "num_tokens": 438083.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 58 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.010707803992740472, "frac_reward_zero_std": 1.0, "grad_norm": 4.23256351496093e-05, "kl": 0.0002868936862796545, "learning_rate": 5.8e-06, "loss": 0.0, "num_tokens": 443467.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 59 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.010889292196007259, "frac_reward_zero_std": 1.0, "grad_norm": 1.621198134671431e-05, "kl": 0.0001916858891490847, "learning_rate": 5.9e-06, "loss": 0.0, "num_tokens": 448851.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 60 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.011070780399274047, "frac_reward_zero_std": 0.0, "grad_norm": 0.09844420850276947, "kl": 0.0004484665405470878, "learning_rate": 6e-06, "loss": 0.0, "num_tokens": 459415.0, "reward": 0.6000000238418579, "reward_std": 0.4242640733718872, "rewards/wrapped_reward_func/mean": 0.6000000238418579, "rewards/wrapped_reward_func/std": 0.4242640733718872, "step": 61 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.011252268602540834, "frac_reward_zero_std": 0.0, "grad_norm": 0.048960305750370026, "kl": 0.0006774089997634292, "learning_rate": 6.1e-06, "loss": 0.0, "num_tokens": 469979.0, "reward": 0.875, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.875, "rewards/wrapped_reward_func/std": 0.15000000596046448, "step": 62 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.011433756805807622, "frac_reward_zero_std": 0.0, "grad_norm": 0.11348327249288559, "kl": 0.0004445563245099038, "learning_rate": 6.2e-06, "loss": 0.0, "num_tokens": 483127.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/wrapped_reward_func/mean": 0.5, "rewards/wrapped_reward_func/std": 0.5773502588272095, "step": 63 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.01161524500907441, "frac_reward_zero_std": 0.0, "grad_norm": 0.12369108945131302, "kl": 0.0007216291560325772, "learning_rate": 6.300000000000001e-06, "loss": 0.0, "num_tokens": 496275.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 64 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.011796733212341199, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014577335969079286, "kl": 0.0008473919006064534, "learning_rate": 6.4000000000000006e-06, "loss": 0.0, "num_tokens": 501543.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 65 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.011978221415607986, "frac_reward_zero_std": 0.0, "grad_norm": 0.15841621160507202, "kl": 0.001075494394171983, "learning_rate": 6.5000000000000004e-06, "loss": 0.0, "num_tokens": 506811.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 66 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.012159709618874774, "frac_reward_zero_std": 0.0, "grad_norm": 0.03461180254817009, "kl": 0.0005635067936964333, "learning_rate": 6.6e-06, "loss": 0.0, "num_tokens": 513119.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 67 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.012341197822141561, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008325293892994523, "kl": 0.0011473970953375101, "learning_rate": 6.700000000000001e-06, "loss": 0.0, "num_tokens": 519427.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 68 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.012522686025408349, "frac_reward_zero_std": 0.0, "grad_norm": 0.1737905889749527, "kl": 0.0005986682663206011, "learning_rate": 6.800000000000001e-06, "loss": 0.0, "num_tokens": 526731.0, "reward": 0.3499999940395355, "reward_std": 0.4725815951824188, "rewards/wrapped_reward_func/mean": 0.3499999940395355, "rewards/wrapped_reward_func/std": 0.47258156538009644, "step": 69 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.012704174228675136, "frac_reward_zero_std": 0.0, "grad_norm": 0.16673524677753448, "kl": 0.000555776699911803, "learning_rate": 6.900000000000001e-06, "loss": 0.0, "num_tokens": 534035.0, "reward": 0.25, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.25, "rewards/wrapped_reward_func/std": 0.5, "step": 70 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.012885662431941924, "frac_reward_zero_std": 1.0, "grad_norm": 2.859368578356225e-05, "kl": 0.0004669393820222467, "learning_rate": 7.000000000000001e-06, "loss": 0.0, "num_tokens": 541579.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 71 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.013067150635208712, "frac_reward_zero_std": 1.0, "grad_norm": 2.926980596384965e-05, "kl": 0.0003753226192202419, "learning_rate": 7.1e-06, "loss": 0.0, "num_tokens": 549123.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 72 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0132486388384755, "frac_reward_zero_std": 1.0, "grad_norm": 9.797024540603161e-05, "kl": 0.0007943064847495407, "learning_rate": 7.2e-06, "loss": 0.0, "num_tokens": 565803.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 73 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.013430127041742287, "frac_reward_zero_std": 1.0, "grad_norm": 6.185122765600681e-05, "kl": 0.0007032470894046128, "learning_rate": 7.2999999999999996e-06, "loss": 0.0, "num_tokens": 582483.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 74 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.013611615245009074, "frac_reward_zero_std": 0.0, "grad_norm": 0.1097329705953598, "kl": 0.00039491921779699624, "learning_rate": 7.4e-06, "loss": 0.0, "num_tokens": 589883.0, "reward": 0.25, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.25, "rewards/wrapped_reward_func/std": 0.5, "step": 75 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.013793103448275862, "frac_reward_zero_std": 0.0, "grad_norm": 0.23440521955490112, "kl": 0.0008568935154471546, "learning_rate": 7.5e-06, "loss": 0.0, "num_tokens": 597283.0, "reward": 0.25, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.25, "rewards/wrapped_reward_func/std": 0.5, "step": 76 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.01397459165154265, "frac_reward_zero_std": 0.0, "grad_norm": 0.18924948573112488, "kl": 0.0005183700122870505, "learning_rate": 7.6e-06, "loss": 0.0, "num_tokens": 604439.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 77 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.014156079854809437, "frac_reward_zero_std": 0.0, "grad_norm": 0.02411881648004055, "kl": 0.0009505435009486973, "learning_rate": 7.7e-06, "loss": 0.0, "num_tokens": 611595.0, "reward": 0.949999988079071, "reward_std": 0.05773504078388214, "rewards/wrapped_reward_func/mean": 0.949999988079071, "rewards/wrapped_reward_func/std": 0.05773504078388214, "step": 78 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.014337568058076225, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011382957745809108, "kl": 0.0008717156597413123, "learning_rate": 7.8e-06, "loss": 0.0, "num_tokens": 615587.0, "reward": 0.8999999761581421, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.8999999761581421, "rewards/wrapped_reward_func/std": 0.0, "step": 79 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.014519056261343012, "frac_reward_zero_std": 0.0, "grad_norm": 0.029936159029603004, "kl": 0.0002899237151723355, "learning_rate": 7.9e-06, "loss": 0.0, "num_tokens": 619579.0, "reward": 0.8499999642372131, "reward_std": 0.10000000149011612, "rewards/wrapped_reward_func/mean": 0.8499999642372131, "rewards/wrapped_reward_func/std": 0.10000000149011612, "step": 80 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0147005444646098, "frac_reward_zero_std": 1.0, "grad_norm": 2.7232792490394786e-05, "kl": 0.0006379409460350871, "learning_rate": 8.000000000000001e-06, "loss": 0.0, "num_tokens": 632727.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 81 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.014882032667876587, "frac_reward_zero_std": 0.0, "grad_norm": 0.14906378090381622, "kl": 0.0005009788146708161, "learning_rate": 8.1e-06, "loss": 0.0, "num_tokens": 645875.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/wrapped_reward_func/mean": 0.5, "rewards/wrapped_reward_func/std": 0.5773502588272095, "step": 82 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.015063520871143375, "frac_reward_zero_std": 0.0, "grad_norm": 0.057446978986263275, "kl": 0.0005312587018124759, "learning_rate": 8.200000000000001e-06, "loss": 0.0, "num_tokens": 651315.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 83 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.015245009074410164, "frac_reward_zero_std": 1.0, "grad_norm": 6.0987727920291945e-05, "kl": 0.0004321683954913169, "learning_rate": 8.3e-06, "loss": 0.0, "num_tokens": 656755.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 84 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.015426497277676952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001345462806057185, "kl": 0.0010705010499805212, "learning_rate": 8.400000000000001e-06, "loss": 0.0, "num_tokens": 662987.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 85 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.01560798548094374, "frac_reward_zero_std": 1.0, "grad_norm": 6.798814138164744e-05, "kl": 0.000827324838610366, "learning_rate": 8.500000000000002e-06, "loss": 0.0, "num_tokens": 669219.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 86 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.015789473684210527, "frac_reward_zero_std": 0.0, "grad_norm": 0.04032070189714432, "kl": 4.808310768567026e-05, "learning_rate": 8.599999999999999e-06, "loss": 0.0, "num_tokens": 682223.0, "reward": 0.8500000238418579, "reward_std": 0.30000001192092896, "rewards/wrapped_reward_func/mean": 0.8500000238418579, "rewards/wrapped_reward_func/std": 0.29999998211860657, "step": 87 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.015970961887477313, "frac_reward_zero_std": 0.0, "grad_norm": 0.17794743180274963, "kl": 0.0004364639171399176, "learning_rate": 8.7e-06, "loss": 0.0, "num_tokens": 695227.0, "reward": 0.25, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.25, "rewards/wrapped_reward_func/std": 0.5, "step": 88 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.016152450090744102, "frac_reward_zero_std": 1.0, "grad_norm": 3.520472819218412e-05, "kl": 0.0004485594108700752, "learning_rate": 8.8e-06, "loss": 0.0, "num_tokens": 714707.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 89 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.016333938294010888, "frac_reward_zero_std": 1.0, "grad_norm": 6.177645991556346e-05, "kl": 0.000828720978461206, "learning_rate": 8.9e-06, "loss": 0.0, "num_tokens": 734187.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 90 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.016515426497277677, "frac_reward_zero_std": 1.0, "grad_norm": 2.813610262819566e-05, "kl": 0.00029348471434786916, "learning_rate": 9e-06, "loss": 0.0, "num_tokens": 747519.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 91 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.016696914700544463, "frac_reward_zero_std": 1.0, "grad_norm": 4.277266270946711e-05, "kl": 0.00042358177597634494, "learning_rate": 9.100000000000001e-06, "loss": 0.0, "num_tokens": 760851.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 92 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.016878402903811252, "frac_reward_zero_std": 0.0, "grad_norm": 0.1220804750919342, "kl": 0.0003108039381913841, "learning_rate": 9.2e-06, "loss": 0.0, "num_tokens": 769491.0, "reward": 0.4749999940395355, "reward_std": 0.5499999523162842, "rewards/wrapped_reward_func/mean": 0.4749999940395355, "rewards/wrapped_reward_func/std": 0.5499999523162842, "step": 93 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.01705989110707804, "frac_reward_zero_std": 0.0, "grad_norm": 0.02356579713523388, "kl": 0.0006888127536512911, "learning_rate": 9.3e-06, "loss": 0.0, "num_tokens": 778131.0, "reward": 0.9750000238418579, "reward_std": 0.050000011920928955, "rewards/wrapped_reward_func/mean": 0.9750000238418579, "rewards/wrapped_reward_func/std": 0.050000011920928955, "step": 94 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.017241379310344827, "frac_reward_zero_std": 0.0, "grad_norm": 0.16322515904903412, "kl": 0.00039913473301567137, "learning_rate": 9.4e-06, "loss": 0.0, "num_tokens": 786963.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/wrapped_reward_func/mean": 0.5, "rewards/wrapped_reward_func/std": 0.5773502588272095, "step": 95 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.017422867513611617, "frac_reward_zero_std": 0.0, "grad_norm": 0.09532254934310913, "kl": 0.0002607015485409647, "learning_rate": 9.5e-06, "loss": 0.0, "num_tokens": 795795.0, "reward": 0.7250000238418579, "reward_std": 0.4856267273426056, "rewards/wrapped_reward_func/mean": 0.7250000238418579, "rewards/wrapped_reward_func/std": 0.4856267273426056, "step": 96 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.017604355716878402, "frac_reward_zero_std": 1.0, "grad_norm": 4.925089524476789e-05, "kl": 0.0007094889879226685, "learning_rate": 9.600000000000001e-06, "loss": 0.0, "num_tokens": 805451.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 97 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.017785843920145192, "frac_reward_zero_std": 0.0, "grad_norm": 0.026672758162021637, "kl": 0.0007368359947577119, "learning_rate": 9.7e-06, "loss": 0.0, "num_tokens": 815107.0, "reward": 0.75, "reward_std": 0.09999998658895493, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.10000000149011612, "step": 98 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.017967332123411978, "frac_reward_zero_std": 0.0, "grad_norm": 0.01951715722680092, "kl": 0.00044461549259722233, "learning_rate": 9.800000000000001e-06, "loss": 0.0, "num_tokens": 822091.0, "reward": 0.9750000238418579, "reward_std": 0.050000011920928955, "rewards/wrapped_reward_func/mean": 0.9750000238418579, "rewards/wrapped_reward_func/std": 0.050000011920928955, "step": 99 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.018148820326678767, "frac_reward_zero_std": 0.0, "grad_norm": 0.13984861969947815, "kl": 0.0018605087825562805, "learning_rate": 9.900000000000002e-06, "loss": 0.0, "num_tokens": 829075.0, "reward": 0.75, "reward_std": 0.43588986992836, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.43588986992836, "step": 100 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.018330308529945553, "frac_reward_zero_std": 0.0, "grad_norm": 0.13625653088092804, "kl": 0.000509134610183537, "learning_rate": 1e-05, "loss": 0.0, "num_tokens": 840471.0, "reward": 0.4000000059604645, "reward_std": 0.46188023686408997, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.46188023686408997, "step": 101 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.018511796733212342, "frac_reward_zero_std": 0.0, "grad_norm": 0.11254780739545822, "kl": 0.00037674958002753556, "learning_rate": 1.0100000000000002e-05, "loss": 0.0, "num_tokens": 851867.0, "reward": 0.625, "reward_std": 0.3499999940395355, "rewards/wrapped_reward_func/mean": 0.625, "rewards/wrapped_reward_func/std": 0.3499999940395355, "step": 102 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.018693284936479128, "frac_reward_zero_std": 1.0, "grad_norm": 0.000105805549537763, "kl": 0.0008986917091533542, "learning_rate": 1.02e-05, "loss": 0.0, "num_tokens": 857599.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 103 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.018874773139745917, "frac_reward_zero_std": 1.0, "grad_norm": 4.886059105047025e-05, "kl": 0.0004961963859386742, "learning_rate": 1.03e-05, "loss": 0.0, "num_tokens": 863331.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 104 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.019056261343012703, "frac_reward_zero_std": 0.0, "grad_norm": 0.13501448929309845, "kl": 0.00045572902308776975, "learning_rate": 1.04e-05, "loss": 0.0, "num_tokens": 869519.0, "reward": 0.6000000238418579, "reward_std": 0.4898979663848877, "rewards/wrapped_reward_func/mean": 0.6000000238418579, "rewards/wrapped_reward_func/std": 0.4898979365825653, "step": 105 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.019237749546279492, "frac_reward_zero_std": 0.0, "grad_norm": 0.12174779176712036, "kl": 0.0004409366811159998, "learning_rate": 1.05e-05, "loss": 0.0, "num_tokens": 875707.0, "reward": 0.675000011920929, "reward_std": 0.471699059009552, "rewards/wrapped_reward_func/mean": 0.675000011920929, "rewards/wrapped_reward_func/std": 0.471699059009552, "step": 106 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.019419237749546278, "frac_reward_zero_std": 0.0, "grad_norm": 0.1761677861213684, "kl": 0.0006832558719906956, "learning_rate": 1.06e-05, "loss": 0.0, "num_tokens": 881355.0, "reward": 0.3499999940395355, "reward_std": 0.47258153557777405, "rewards/wrapped_reward_func/mean": 0.3499999940395355, "rewards/wrapped_reward_func/std": 0.47258156538009644, "step": 107 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.019600725952813067, "frac_reward_zero_std": 0.0, "grad_norm": 0.13886089622974396, "kl": 0.0006486559577751905, "learning_rate": 1.0700000000000001e-05, "loss": 0.0, "num_tokens": 887003.0, "reward": 0.5249999761581421, "reward_std": 0.42720019817352295, "rewards/wrapped_reward_func/mean": 0.5249999761581421, "rewards/wrapped_reward_func/std": 0.42720016837120056, "step": 108 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.019782214156079853, "frac_reward_zero_std": 0.0, "grad_norm": 0.06477871537208557, "kl": 0.0006253777828533202, "learning_rate": 1.08e-05, "loss": 0.0, "num_tokens": 898807.0, "reward": 0.125, "reward_std": 0.18929694592952728, "rewards/wrapped_reward_func/mean": 0.125, "rewards/wrapped_reward_func/std": 0.18929694592952728, "step": 109 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.019963702359346643, "frac_reward_zero_std": 0.0, "grad_norm": 0.07767630368471146, "kl": 0.000735009991331026, "learning_rate": 1.09e-05, "loss": 0.0, "num_tokens": 910611.0, "reward": 0.10000000149011612, "reward_std": 0.20000000298023224, "rewards/wrapped_reward_func/mean": 0.10000000149011612, "rewards/wrapped_reward_func/std": 0.20000000298023224, "step": 110 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.020145190562613432, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014764619118068367, "kl": 0.0009202146320603788, "learning_rate": 1.1000000000000001e-05, "loss": 0.0, "num_tokens": 916031.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 111 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.020326678765880218, "frac_reward_zero_std": 0.0, "grad_norm": 0.14504970610141754, "kl": 0.0008864243573043495, "learning_rate": 1.11e-05, "loss": 0.0, "num_tokens": 921451.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 112 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.020508166969147007, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010061580542242154, "kl": 0.0007461362692993134, "learning_rate": 1.1200000000000001e-05, "loss": 0.0, "num_tokens": 924723.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 113 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.020689655172413793, "frac_reward_zero_std": 1.0, "grad_norm": 6.0571943322429433e-05, "kl": 0.000654085073620081, "learning_rate": 1.13e-05, "loss": 0.0, "num_tokens": 927995.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 114 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.020871143375680582, "frac_reward_zero_std": 1.0, "grad_norm": 7.844649371691048e-05, "kl": 0.00035969377495348454, "learning_rate": 1.1400000000000001e-05, "loss": 0.0, "num_tokens": 933735.0, "reward": 0.8999999761581421, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.8999999761581421, "rewards/wrapped_reward_func/std": 0.0, "step": 115 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.021052631578947368, "frac_reward_zero_std": 0.0, "grad_norm": 0.0793575718998909, "kl": 0.00027578978915698826, "learning_rate": 1.1500000000000002e-05, "loss": 0.0, "num_tokens": 939475.0, "reward": 0.625, "reward_std": 0.42720016837120056, "rewards/wrapped_reward_func/mean": 0.625, "rewards/wrapped_reward_func/std": 0.42720019817352295, "step": 116 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.021234119782214157, "frac_reward_zero_std": 1.0, "grad_norm": 9.023349412018433e-05, "kl": 0.0006238257919903845, "learning_rate": 1.16e-05, "loss": 0.0, "num_tokens": 947587.0, "reward": 0.8999999761581421, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.8999999761581421, "rewards/wrapped_reward_func/std": 0.0, "step": 117 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.021415607985480943, "frac_reward_zero_std": 0.0, "grad_norm": 0.0760384127497673, "kl": 0.00040308150346390903, "learning_rate": 1.1700000000000001e-05, "loss": 0.0, "num_tokens": 955699.0, "reward": 0.6749999523162842, "reward_std": 0.44999998807907104, "rewards/wrapped_reward_func/mean": 0.6749999523162842, "rewards/wrapped_reward_func/std": 0.44999998807907104, "step": 118 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.021597096188747732, "frac_reward_zero_std": 1.0, "grad_norm": 4.564837217913009e-05, "kl": 0.0007287761254701763, "learning_rate": 1.18e-05, "loss": 0.0, "num_tokens": 962903.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 119 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.021778584392014518, "frac_reward_zero_std": 1.0, "grad_norm": 5.4649950470775366e-05, "kl": 0.0005845886771567166, "learning_rate": 1.19e-05, "loss": 0.0, "num_tokens": 970107.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 120 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.021960072595281308, "frac_reward_zero_std": 1.0, "grad_norm": 6.335011858027428e-05, "kl": 0.0005868363950867206, "learning_rate": 1.2e-05, "loss": 0.0, "num_tokens": 974335.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 121 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.022141560798548093, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010012676648329943, "kl": 0.0007547848508693278, "learning_rate": 1.2100000000000001e-05, "loss": 0.0, "num_tokens": 978563.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 122 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.022323049001814883, "frac_reward_zero_std": 1.0, "grad_norm": 5.035430513089523e-05, "kl": 0.0005698132736142725, "learning_rate": 1.22e-05, "loss": 0.0, "num_tokens": 984971.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 123 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.02250453720508167, "frac_reward_zero_std": 1.0, "grad_norm": 4.9255464546149597e-05, "kl": 0.0004900553321931511, "learning_rate": 1.23e-05, "loss": 0.0, "num_tokens": 991379.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 124 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.022686025408348458, "frac_reward_zero_std": 0.0, "grad_norm": 0.12324240803718567, "kl": 0.0006296962674241513, "learning_rate": 1.24e-05, "loss": 0.0, "num_tokens": 1002607.0, "reward": 0.6000000238418579, "reward_std": 0.4242640733718872, "rewards/wrapped_reward_func/mean": 0.6000000238418579, "rewards/wrapped_reward_func/std": 0.4242640733718872, "step": 125 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.022867513611615244, "frac_reward_zero_std": 0.0, "grad_norm": 0.11322531849145889, "kl": 0.0007824395433999598, "learning_rate": 1.25e-05, "loss": 0.0, "num_tokens": 1013835.0, "reward": 0.5249999761581421, "reward_std": 0.3499999940395355, "rewards/wrapped_reward_func/mean": 0.5249999761581421, "rewards/wrapped_reward_func/std": 0.3499999940395355, "step": 126 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.023049001814882033, "frac_reward_zero_std": 0.0, "grad_norm": 0.14141641557216644, "kl": 0.0006997122545726597, "learning_rate": 1.2600000000000001e-05, "loss": 0.0, "num_tokens": 1019963.0, "reward": 0.699999988079071, "reward_std": 0.3464101552963257, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.3464101552963257, "step": 127 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.02323049001814882, "frac_reward_zero_std": 0.0, "grad_norm": 0.09403439611196518, "kl": 0.0005391051236074418, "learning_rate": 1.27e-05, "loss": 0.0, "num_tokens": 1026091.0, "reward": 0.625, "reward_std": 0.28722813725471497, "rewards/wrapped_reward_func/mean": 0.625, "rewards/wrapped_reward_func/std": 0.28722813725471497, "step": 128 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.023411978221415608, "frac_reward_zero_std": 1.0, "grad_norm": 9.992220293497667e-05, "kl": 0.0008699820609763265, "learning_rate": 1.2800000000000001e-05, "loss": 0.0, "num_tokens": 1032559.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 129 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.023593466424682397, "frac_reward_zero_std": 1.0, "grad_norm": 7.210667536128312e-05, "kl": 0.000754145992686972, "learning_rate": 1.29e-05, "loss": 0.0, "num_tokens": 1039027.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 130 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.023774954627949183, "frac_reward_zero_std": 1.0, "grad_norm": 5.1751499995589256e-05, "kl": 0.0005244244530331343, "learning_rate": 1.3000000000000001e-05, "loss": 0.0, "num_tokens": 1044715.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 131 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.023956442831215972, "frac_reward_zero_std": 0.0, "grad_norm": 0.11803828924894333, "kl": 0.00032159939291886985, "learning_rate": 1.3100000000000002e-05, "loss": 0.0, "num_tokens": 1050403.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 132 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.02413793103448276, "frac_reward_zero_std": 0.0, "grad_norm": 0.12353762239217758, "kl": 0.0004967287532053888, "learning_rate": 1.32e-05, "loss": 0.0, "num_tokens": 1056611.0, "reward": 0.3499999940395355, "reward_std": 0.40414518117904663, "rewards/wrapped_reward_func/mean": 0.3499999940395355, "rewards/wrapped_reward_func/std": 0.40414518117904663, "step": 133 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.024319419237749548, "frac_reward_zero_std": 0.0, "grad_norm": 0.05163593217730522, "kl": 0.0005216796998865902, "learning_rate": 1.3300000000000001e-05, "loss": 0.0, "num_tokens": 1062819.0, "reward": 0.625, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.625, "rewards/wrapped_reward_func/std": 0.14999999105930328, "step": 134 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.024500907441016333, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011294012801954523, "kl": 0.0005276227602735162, "learning_rate": 1.3400000000000002e-05, "loss": 0.0, "num_tokens": 1068855.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 135 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.024682395644283123, "frac_reward_zero_std": 1.0, "grad_norm": 6.26285036560148e-05, "kl": 0.0006142389902379364, "learning_rate": 1.3500000000000001e-05, "loss": 0.0, "num_tokens": 1074891.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 136 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.02486388384754991, "frac_reward_zero_std": 0.0, "grad_norm": 0.10082340240478516, "kl": 0.00027807854348793626, "learning_rate": 1.3600000000000002e-05, "loss": 0.0, "num_tokens": 1080343.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 137 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.025045372050816698, "frac_reward_zero_std": 1.0, "grad_norm": 6.828713230788708e-05, "kl": 0.0007091233273968101, "learning_rate": 1.3700000000000001e-05, "loss": 0.0, "num_tokens": 1085795.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 138 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.025226860254083484, "frac_reward_zero_std": 0.0, "grad_norm": 0.06921590864658356, "kl": 0.0005263265920802951, "learning_rate": 1.3800000000000002e-05, "loss": 0.0, "num_tokens": 1101203.0, "reward": 0.30000001192092896, "reward_std": 0.20000000298023224, "rewards/wrapped_reward_func/mean": 0.30000001192092896, "rewards/wrapped_reward_func/std": 0.20000000298023224, "step": 139 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.025408348457350273, "frac_reward_zero_std": 0.0, "grad_norm": 0.11369840800762177, "kl": 0.0003280928067397326, "learning_rate": 1.3900000000000002e-05, "loss": 0.0, "num_tokens": 1116611.0, "reward": 0.550000011920929, "reward_std": 0.30000001192092896, "rewards/wrapped_reward_func/mean": 0.550000011920929, "rewards/wrapped_reward_func/std": 0.29999998211860657, "step": 140 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.02558983666061706, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017296464648097754, "kl": 0.0009061727614607662, "learning_rate": 1.4000000000000001e-05, "loss": 0.0, "num_tokens": 1121795.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 141 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.025771324863883848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001108905635192059, "kl": 0.0004197128291707486, "learning_rate": 1.4099999999999999e-05, "loss": 0.0, "num_tokens": 1126979.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 142 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.025952813067150634, "frac_reward_zero_std": 0.0, "grad_norm": 0.029532575979828835, "kl": 0.0005916706868447363, "learning_rate": 1.42e-05, "loss": 0.0, "num_tokens": 1134371.0, "reward": 0.8499999642372131, "reward_std": 0.10000000149011612, "rewards/wrapped_reward_func/mean": 0.8499999642372131, "rewards/wrapped_reward_func/std": 0.10000000149011612, "step": 143 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.026134301270417423, "frac_reward_zero_std": 1.0, "grad_norm": 3.127852687612176e-05, "kl": 0.0001251205976586789, "learning_rate": 1.43e-05, "loss": 0.0, "num_tokens": 1141763.0, "reward": 0.8999999761581421, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.8999999761581421, "rewards/wrapped_reward_func/std": 0.0, "step": 144 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.02631578947368421, "frac_reward_zero_std": 0.0, "grad_norm": 0.01891416311264038, "kl": 0.0003356830566190183, "learning_rate": 1.44e-05, "loss": 0.0, "num_tokens": 1149515.0, "reward": 0.05000000074505806, "reward_std": 0.057735029608011246, "rewards/wrapped_reward_func/mean": 0.05000000074505806, "rewards/wrapped_reward_func/std": 0.057735029608011246, "step": 145 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.026497277676951, "frac_reward_zero_std": 1.0, "grad_norm": 4.150362656218931e-05, "kl": 0.00042399566154927015, "learning_rate": 1.45e-05, "loss": 0.0, "num_tokens": 1157267.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 146 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.026678765880217784, "frac_reward_zero_std": 0.0, "grad_norm": 0.10422726720571518, "kl": 0.0007346750644501299, "learning_rate": 1.4599999999999999e-05, "loss": 0.0, "num_tokens": 1164419.0, "reward": 0.5249999761581421, "reward_std": 0.3499999940395355, "rewards/wrapped_reward_func/mean": 0.5249999761581421, "rewards/wrapped_reward_func/std": 0.3499999940395355, "step": 147 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.026860254083484573, "frac_reward_zero_std": 1.0, "grad_norm": 7.155847561080009e-05, "kl": 0.0006855344399809837, "learning_rate": 1.47e-05, "loss": 0.0, "num_tokens": 1171571.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 148 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.027041742286751363, "frac_reward_zero_std": 0.0, "grad_norm": 0.13662147521972656, "kl": 0.0006457297131419182, "learning_rate": 1.48e-05, "loss": 0.0, "num_tokens": 1185543.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 149 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.02722323049001815, "frac_reward_zero_std": 0.0, "grad_norm": 0.17860014736652374, "kl": 0.0006347236630972475, "learning_rate": 1.49e-05, "loss": 0.0, "num_tokens": 1199515.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/wrapped_reward_func/mean": 0.5, "rewards/wrapped_reward_func/std": 0.5773502588272095, "step": 150 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.027404718693284938, "frac_reward_zero_std": 0.0, "grad_norm": 0.09732640534639359, "kl": 0.0005127542244736105, "learning_rate": 1.5e-05, "loss": 0.0, "num_tokens": 1209831.0, "reward": 0.574999988079071, "reward_std": 0.46457868814468384, "rewards/wrapped_reward_func/mean": 0.574999988079071, "rewards/wrapped_reward_func/std": 0.46457865834236145, "step": 151 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.027586206896551724, "frac_reward_zero_std": 1.0, "grad_norm": 9.343041892861947e-05, "kl": 0.0008469194290228188, "learning_rate": 1.51e-05, "loss": 0.0, "num_tokens": 1220147.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 152 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.027767695099818513, "frac_reward_zero_std": 1.0, "grad_norm": 6.978522287681699e-05, "kl": 0.0007957836205605417, "learning_rate": 1.52e-05, "loss": 0.0, "num_tokens": 1229187.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 153 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0279491833030853, "frac_reward_zero_std": 0.0, "grad_norm": 0.17092126607894897, "kl": 0.0010071030119434, "learning_rate": 1.53e-05, "loss": 0.0, "num_tokens": 1238227.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 154 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.028130671506352088, "frac_reward_zero_std": 0.0, "grad_norm": 0.15427365899085999, "kl": 0.0008276681764982641, "learning_rate": 1.54e-05, "loss": 0.0, "num_tokens": 1262635.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 155 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.028312159709618874, "frac_reward_zero_std": 0.0, "grad_norm": 0.1104019284248352, "kl": 0.00032694527180865407, "learning_rate": 1.55e-05, "loss": 0.0, "num_tokens": 1287043.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/wrapped_reward_func/mean": 0.5, "rewards/wrapped_reward_func/std": 0.5773502588272095, "step": 156 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.028493647912885663, "frac_reward_zero_std": 0.0, "grad_norm": 0.26840680837631226, "kl": 0.0017632609233260155, "learning_rate": 1.56e-05, "loss": 0.0, "num_tokens": 1293539.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/wrapped_reward_func/mean": 0.5, "rewards/wrapped_reward_func/std": 0.5773502588272095, "step": 157 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.02867513611615245, "frac_reward_zero_std": 0.0, "grad_norm": 0.11549533903598785, "kl": 0.000955199560848996, "learning_rate": 1.5700000000000002e-05, "loss": 0.0, "num_tokens": 1300035.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 158 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.02885662431941924, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013358128489926457, "kl": 0.0010983232059516013, "learning_rate": 1.58e-05, "loss": 0.0, "num_tokens": 1305743.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 159 }, { "completion_length": 452.75, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 452.75, "completions/mean_terminated_length": 275.0, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.029038112522686024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001872592984000221, "kl": 0.0016651522601023316, "learning_rate": 1.59e-05, "loss": 0.0, "num_tokens": 1311214.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 160 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.029219600725952814, "frac_reward_zero_std": 1.0, "grad_norm": 0.00020183948799967766, "kl": 0.0016334459651261568, "learning_rate": 1.6000000000000003e-05, "loss": 0.0, "num_tokens": 1315794.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 161 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0294010889292196, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010147599095944315, "kl": 0.0011429008154664189, "learning_rate": 1.6100000000000002e-05, "loss": 0.0, "num_tokens": 1320374.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 162 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.02958257713248639, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002572286466602236, "kl": 0.0018921304726973176, "learning_rate": 1.62e-05, "loss": 0.0, "num_tokens": 1323814.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 163 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.029764065335753175, "frac_reward_zero_std": 1.0, "grad_norm": 0.00028549053240567446, "kl": 0.0030297093326225877, "learning_rate": 1.63e-05, "loss": 0.0, "num_tokens": 1327254.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 164 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.029945553539019964, "frac_reward_zero_std": 0.0, "grad_norm": 0.06666184961795807, "kl": 0.0015580069739371538, "learning_rate": 1.6400000000000002e-05, "loss": 0.0, "num_tokens": 1336202.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 165 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03012704174228675, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013120105722919106, "kl": 0.0018121229950338602, "learning_rate": 1.65e-05, "loss": 0.0, "num_tokens": 1345150.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 166 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03030852994555354, "frac_reward_zero_std": 0.0, "grad_norm": 0.08287055790424347, "kl": 0.001847608422394842, "learning_rate": 1.66e-05, "loss": 0.0, "num_tokens": 1352722.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 167 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.030490018148820328, "frac_reward_zero_std": 1.0, "grad_norm": 5.999646236887202e-05, "kl": 0.0005552320508286357, "learning_rate": 1.6700000000000003e-05, "loss": 0.0, "num_tokens": 1360294.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 168 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.030671506352087114, "frac_reward_zero_std": 0.0, "grad_norm": 0.09856101870536804, "kl": 0.0015833227953407913, "learning_rate": 1.6800000000000002e-05, "loss": 0.0, "num_tokens": 1364806.0, "reward": 0.8500000238418579, "reward_std": 0.17320507764816284, "rewards/wrapped_reward_func/mean": 0.8500000238418579, "rewards/wrapped_reward_func/std": 0.17320507764816284, "step": 169 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.030852994555353903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001713380333967507, "kl": 0.0011462417896836996, "learning_rate": 1.69e-05, "loss": 0.0, "num_tokens": 1369318.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 170 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03103448275862069, "frac_reward_zero_std": 1.0, "grad_norm": 0.00022610783344134688, "kl": 0.0031678732484579086, "learning_rate": 1.7000000000000003e-05, "loss": 0.0, "num_tokens": 1372722.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 171 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03121597096188748, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013466810924001038, "kl": 0.0018413604702800512, "learning_rate": 1.7100000000000002e-05, "loss": 0.0, "num_tokens": 1376126.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 172 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03139745916515427, "frac_reward_zero_std": 0.0, "grad_norm": 0.06533285230398178, "kl": 0.002063628169707954, "learning_rate": 1.7199999999999998e-05, "loss": 0.0, "num_tokens": 1380346.0, "reward": 0.550000011920929, "reward_std": 0.17320506274700165, "rewards/wrapped_reward_func/mean": 0.550000011920929, "rewards/wrapped_reward_func/std": 0.17320507764816284, "step": 173 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.031578947368421054, "frac_reward_zero_std": 0.0, "grad_norm": 0.07684276252985, "kl": 0.002453105291351676, "learning_rate": 1.73e-05, "loss": 0.0, "num_tokens": 1384566.0, "reward": 0.550000011920929, "reward_std": 0.17320506274700165, "rewards/wrapped_reward_func/mean": 0.550000011920929, "rewards/wrapped_reward_func/std": 0.17320507764816284, "step": 174 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03176043557168784, "frac_reward_zero_std": 1.0, "grad_norm": 0.000193728570593521, "kl": 0.002760101342573762, "learning_rate": 1.74e-05, "loss": 0.0, "num_tokens": 1387974.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 175 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.031941923774954625, "frac_reward_zero_std": 1.0, "grad_norm": 0.006914151832461357, "kl": 0.020759769715368748, "learning_rate": 1.75e-05, "loss": 0.0, "num_tokens": 1391382.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 176 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03212341197822142, "frac_reward_zero_std": 0.0, "grad_norm": 0.12328701466321945, "kl": 0.0012807021848857403, "learning_rate": 1.76e-05, "loss": 0.0, "num_tokens": 1406614.0, "reward": 0.7749999761581421, "reward_std": 0.28722813725471497, "rewards/wrapped_reward_func/mean": 0.7749999761581421, "rewards/wrapped_reward_func/std": 0.28722813725471497, "step": 177 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.032304900181488204, "frac_reward_zero_std": 0.0, "grad_norm": 0.17080311477184296, "kl": 0.0019727882463485003, "learning_rate": 1.77e-05, "loss": 0.0, "num_tokens": 1421846.0, "reward": 0.8500000238418579, "reward_std": 0.29999998211860657, "rewards/wrapped_reward_func/mean": 0.8500000238418579, "rewards/wrapped_reward_func/std": 0.29999998211860657, "step": 178 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03248638838475499, "frac_reward_zero_std": 1.0, "grad_norm": 9.546494402457029e-05, "kl": 0.0015494156396016479, "learning_rate": 1.78e-05, "loss": 0.0, "num_tokens": 1427770.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 179 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.032667876588021776, "frac_reward_zero_std": 1.0, "grad_norm": 9.098558075493202e-05, "kl": 0.0016139246290549636, "learning_rate": 1.79e-05, "loss": 0.0, "num_tokens": 1433694.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 180 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03284936479128857, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011545241432031617, "kl": 0.0014241196913644671, "learning_rate": 1.8e-05, "loss": 0.0, "num_tokens": 1441410.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 181 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.033030852994555354, "frac_reward_zero_std": 1.0, "grad_norm": 0.00022318978153634816, "kl": 0.0025556186446920037, "learning_rate": 1.81e-05, "loss": 0.0, "num_tokens": 1449126.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 182 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03321234119782214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003184578672517091, "kl": 0.00228856282774359, "learning_rate": 1.8200000000000002e-05, "loss": 0.0, "num_tokens": 1455586.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.10000000149011612, "rewards/wrapped_reward_func/std": 0.0, "step": 183 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.033393829401088926, "frac_reward_zero_std": 0.0, "grad_norm": 0.029413729906082153, "kl": 0.003175846068188548, "learning_rate": 1.83e-05, "loss": 0.0, "num_tokens": 1462046.0, "reward": 0.02500000037252903, "reward_std": 0.05000000074505806, "rewards/wrapped_reward_func/mean": 0.02500000037252903, "rewards/wrapped_reward_func/std": 0.05000000074505806, "step": 184 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03357531760435572, "frac_reward_zero_std": 0.0, "grad_norm": 0.04223480448126793, "kl": 0.0017811767174862325, "learning_rate": 1.84e-05, "loss": 0.0, "num_tokens": 1468294.0, "reward": 0.5, "reward_std": 0.11547006666660309, "rewards/wrapped_reward_func/mean": 0.5, "rewards/wrapped_reward_func/std": 0.11547006666660309, "step": 185 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.033756805807622504, "frac_reward_zero_std": 0.0, "grad_norm": 0.05083512142300606, "kl": 0.0032943011028692126, "learning_rate": 1.85e-05, "loss": 0.0, "num_tokens": 1474542.0, "reward": 0.44999998807907104, "reward_std": 0.10000000149011612, "rewards/wrapped_reward_func/mean": 0.44999998807907104, "rewards/wrapped_reward_func/std": 0.10000000894069672, "step": 186 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03393829401088929, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017624779138714075, "kl": 0.001775700831785798, "learning_rate": 1.86e-05, "loss": 0.0, "num_tokens": 1481818.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 187 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03411978221415608, "frac_reward_zero_std": 1.0, "grad_norm": 0.00019040869665332139, "kl": 0.0013375032576732337, "learning_rate": 1.87e-05, "loss": 0.0, "num_tokens": 1489094.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 188 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03430127041742287, "frac_reward_zero_std": 1.0, "grad_norm": 0.00039358556387014687, "kl": 0.0023516693618148565, "learning_rate": 1.88e-05, "loss": 0.0, "num_tokens": 1494662.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 189 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.034482758620689655, "frac_reward_zero_std": 0.0, "grad_norm": 0.12289774417877197, "kl": 0.0011039938253816217, "learning_rate": 1.8900000000000002e-05, "loss": 0.0, "num_tokens": 1500230.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 190 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03466424682395644, "frac_reward_zero_std": 0.0, "grad_norm": 0.03303716704249382, "kl": 0.0019113918533548713, "learning_rate": 1.9e-05, "loss": 0.0, "num_tokens": 1507434.0, "reward": 0.8499999642372131, "reward_std": 0.10000000149011612, "rewards/wrapped_reward_func/mean": 0.8499999642372131, "rewards/wrapped_reward_func/std": 0.10000000149011612, "step": 191 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03484573502722323, "frac_reward_zero_std": 0.0, "grad_norm": 0.05351991951465607, "kl": 0.002117044641636312, "learning_rate": 1.91e-05, "loss": 0.0, "num_tokens": 1514638.0, "reward": 0.8499999642372131, "reward_std": 0.10000000149011612, "rewards/wrapped_reward_func/mean": 0.8499999642372131, "rewards/wrapped_reward_func/std": 0.10000000149011612, "step": 192 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03502722323049002, "frac_reward_zero_std": 1.0, "grad_norm": 0.00021947747154626995, "kl": 0.0022881045006215572, "learning_rate": 1.9200000000000003e-05, "loss": 0.0, "num_tokens": 1520374.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 193 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.035208711433756805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002677013399079442, "kl": 0.002351693226955831, "learning_rate": 1.93e-05, "loss": 0.0, "num_tokens": 1526110.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 194 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03539019963702359, "frac_reward_zero_std": 1.0, "grad_norm": 0.00019267959578428417, "kl": 0.0010431880655232817, "learning_rate": 1.94e-05, "loss": 0.0, "num_tokens": 1536094.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 195 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.035571687840290384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005001563695259392, "kl": 0.0038000678177922964, "learning_rate": 1.9500000000000003e-05, "loss": 0.0, "num_tokens": 1546078.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 196 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03575317604355717, "frac_reward_zero_std": 0.0, "grad_norm": 0.05683290213346481, "kl": 0.0013576410710811615, "learning_rate": 1.9600000000000002e-05, "loss": 0.0, "num_tokens": 1551506.0, "reward": 0.625, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.625, "rewards/wrapped_reward_func/std": 0.14999999105930328, "step": 197 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.035934664246823955, "frac_reward_zero_std": 0.0, "grad_norm": 0.08140730112791061, "kl": 0.00261079816846177, "learning_rate": 1.97e-05, "loss": 0.0, "num_tokens": 1556934.0, "reward": 0.44999998807907104, "reward_std": 0.33166247606277466, "rewards/wrapped_reward_func/mean": 0.44999998807907104, "rewards/wrapped_reward_func/std": 0.33166247606277466, "step": 198 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03611615245009074, "frac_reward_zero_std": 0.0, "grad_norm": 0.05183498561382294, "kl": 0.0023234139662235975, "learning_rate": 1.9800000000000004e-05, "loss": 0.0, "num_tokens": 1563198.0, "reward": 0.7999999523162842, "reward_std": 0.1154700443148613, "rewards/wrapped_reward_func/mean": 0.7999999523162842, "rewards/wrapped_reward_func/std": 0.1154700443148613, "step": 199 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.036297640653357534, "frac_reward_zero_std": 0.0, "grad_norm": 0.03557595610618591, "kl": 0.001098328037187457, "learning_rate": 1.9900000000000003e-05, "loss": 0.0, "num_tokens": 1569462.0, "reward": 0.824999988079071, "reward_std": 0.09574270248413086, "rewards/wrapped_reward_func/mean": 0.824999988079071, "rewards/wrapped_reward_func/std": 0.09574270248413086, "step": 200 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03647912885662432, "frac_reward_zero_std": 0.0, "grad_norm": 0.09482787549495697, "kl": 0.0037172712618485093, "learning_rate": 2e-05, "loss": 0.0, "num_tokens": 1581478.0, "reward": 0.10000000149011612, "reward_std": 0.20000000298023224, "rewards/wrapped_reward_func/mean": 0.10000000149011612, "rewards/wrapped_reward_func/std": 0.20000000298023224, "step": 201 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.036660617059891105, "frac_reward_zero_std": 0.0, "grad_norm": 0.09073001891374588, "kl": 0.003026353078894317, "learning_rate": 2.01e-05, "loss": 0.0, "num_tokens": 1593494.0, "reward": 0.10000000149011612, "reward_std": 0.20000000298023224, "rewards/wrapped_reward_func/mean": 0.10000000149011612, "rewards/wrapped_reward_func/std": 0.20000000298023224, "step": 202 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03684210526315789, "frac_reward_zero_std": 1.0, "grad_norm": 0.00022502498177345842, "kl": 0.003138842643238604, "learning_rate": 2.0200000000000003e-05, "loss": 0.0, "num_tokens": 1599874.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 203 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.037023593466424684, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017925114661920816, "kl": 0.0022263480350375175, "learning_rate": 2.0300000000000002e-05, "loss": 0.0, "num_tokens": 1606254.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 204 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03720508166969147, "frac_reward_zero_std": 0.0, "grad_norm": 0.4317794442176819, "kl": 0.004235577420331538, "learning_rate": 2.04e-05, "loss": 0.0, "num_tokens": 1612882.0, "reward": 0.25, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.25, "rewards/wrapped_reward_func/std": 0.5, "step": 205 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.037386569872958256, "frac_reward_zero_std": 0.0, "grad_norm": 0.2670785188674927, "kl": 0.004196037771180272, "learning_rate": 2.05e-05, "loss": 0.0, "num_tokens": 1619510.0, "reward": 0.22499999403953552, "reward_std": 0.44999998807907104, "rewards/wrapped_reward_func/mean": 0.22499999403953552, "rewards/wrapped_reward_func/std": 0.44999995827674866, "step": 206 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03756805807622505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002623483305796981, "kl": 0.003821979509666562, "learning_rate": 2.06e-05, "loss": 0.0, "num_tokens": 1624994.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 207 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.037749546279491834, "frac_reward_zero_std": 1.0, "grad_norm": 0.000310488831019029, "kl": 0.004578559193760157, "learning_rate": 2.07e-05, "loss": 0.0, "num_tokens": 1630478.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 208 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03793103448275862, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017046586435753852, "kl": 0.0019413677509874105, "learning_rate": 2.08e-05, "loss": 0.0, "num_tokens": 1647698.0, "reward": 0.8999999761581421, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.8999999761581421, "rewards/wrapped_reward_func/std": 0.0, "step": 209 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.038112522686025406, "frac_reward_zero_std": 0.0, "grad_norm": 0.13131605088710785, "kl": 0.0015305030392482877, "learning_rate": 2.09e-05, "loss": 0.0, "num_tokens": 1664918.0, "reward": 0.6749999523162842, "reward_std": 0.44999998807907104, "rewards/wrapped_reward_func/mean": 0.6749999523162842, "rewards/wrapped_reward_func/std": 0.44999998807907104, "step": 210 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0382940108892922, "frac_reward_zero_std": 0.0, "grad_norm": 0.09632514417171478, "kl": 0.004235484870150685, "learning_rate": 2.1e-05, "loss": 0.0, "num_tokens": 1676646.0, "reward": 0.7749999761581421, "reward_std": 0.15000000596046448, "rewards/wrapped_reward_func/mean": 0.7749999761581421, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 211 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.038475499092558985, "frac_reward_zero_std": 0.0, "grad_norm": 0.053336624056100845, "kl": 0.0028316440293565392, "learning_rate": 2.11e-05, "loss": 0.0, "num_tokens": 1688374.0, "reward": 0.7749999761581421, "reward_std": 0.15000000596046448, "rewards/wrapped_reward_func/mean": 0.7749999761581421, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 212 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03865698729582577, "frac_reward_zero_std": 0.0, "grad_norm": 0.11165022104978561, "kl": 0.005199836217798293, "learning_rate": 2.12e-05, "loss": 0.0, "num_tokens": 1693206.0, "reward": 0.550000011920929, "reward_std": 0.17320506274700165, "rewards/wrapped_reward_func/mean": 0.550000011920929, "rewards/wrapped_reward_func/std": 0.17320507764816284, "step": 213 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.038838475499092556, "frac_reward_zero_std": 0.0, "grad_norm": 0.11771408468484879, "kl": 0.008155649062246084, "learning_rate": 2.13e-05, "loss": 0.0, "num_tokens": 1698038.0, "reward": 0.550000011920929, "reward_std": 0.17320506274700165, "rewards/wrapped_reward_func/mean": 0.550000011920929, "rewards/wrapped_reward_func/std": 0.17320507764816284, "step": 214 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03901996370235935, "frac_reward_zero_std": 0.0, "grad_norm": 0.06154588982462883, "kl": 0.003949125297367573, "learning_rate": 2.1400000000000002e-05, "loss": 0.0, "num_tokens": 1711582.0, "reward": 0.824999988079071, "reward_std": 0.15000000596046448, "rewards/wrapped_reward_func/mean": 0.824999988079071, "rewards/wrapped_reward_func/std": 0.15000000596046448, "step": 215 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.039201451905626135, "frac_reward_zero_std": 0.0, "grad_norm": 0.1364002674818039, "kl": 0.0028250747127458453, "learning_rate": 2.15e-05, "loss": 0.0, "num_tokens": 1725126.0, "reward": 0.7749999761581421, "reward_std": 0.2872281074523926, "rewards/wrapped_reward_func/mean": 0.7749999761581421, "rewards/wrapped_reward_func/std": 0.28722813725471497, "step": 216 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03938294010889292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005236301221884787, "kl": 0.006152130430564284, "learning_rate": 2.16e-05, "loss": 0.0, "num_tokens": 1739382.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 217 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.039564428312159707, "frac_reward_zero_std": 1.0, "grad_norm": 0.00047670365893281996, "kl": 0.005807934561744332, "learning_rate": 2.1700000000000002e-05, "loss": 0.0, "num_tokens": 1753638.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 218 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0397459165154265, "frac_reward_zero_std": 1.0, "grad_norm": 0.00037056164001114666, "kl": 0.00487659964710474, "learning_rate": 2.18e-05, "loss": 0.0, "num_tokens": 1763346.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.10000000149011612, "rewards/wrapped_reward_func/std": 0.0, "step": 219 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.039927404718693285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002509787445887923, "kl": 0.0032497502397745848, "learning_rate": 2.19e-05, "loss": 0.0, "num_tokens": 1773054.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.10000000149011612, "rewards/wrapped_reward_func/std": 0.0, "step": 220 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04010889292196007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011058042291551828, "kl": 0.008516516303643584, "learning_rate": 2.2000000000000003e-05, "loss": 0.0, "num_tokens": 1777986.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 221 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.040290381125226864, "frac_reward_zero_std": 1.0, "grad_norm": 0.00025052562705241144, "kl": 0.0028456407599151134, "learning_rate": 2.2100000000000002e-05, "loss": 0.0, "num_tokens": 1782918.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 222 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04047186932849365, "frac_reward_zero_std": 0.0, "grad_norm": 0.17227114737033844, "kl": 0.005359752103686333, "learning_rate": 2.22e-05, "loss": 0.0, "num_tokens": 1790582.0, "reward": 0.5249999761581421, "reward_std": 0.3499999940395355, "rewards/wrapped_reward_func/mean": 0.5249999761581421, "rewards/wrapped_reward_func/std": 0.3499999940395355, "step": 223 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.040653357531760435, "frac_reward_zero_std": 0.0, "grad_norm": 0.11551225185394287, "kl": 0.00462198187597096, "learning_rate": 2.23e-05, "loss": 0.0, "num_tokens": 1798246.0, "reward": 0.20000000298023224, "reward_std": 0.23094011843204498, "rewards/wrapped_reward_func/mean": 0.20000000298023224, "rewards/wrapped_reward_func/std": 0.23094011843204498, "step": 224 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04083484573502722, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006947724614292383, "kl": 0.004146949853748083, "learning_rate": 2.2400000000000002e-05, "loss": 0.0, "num_tokens": 1813114.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 225 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.041016333938294014, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042997009586542845, "kl": 0.006564317271113396, "learning_rate": 2.25e-05, "loss": 0.0, "num_tokens": 1827982.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 226 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0411978221415608, "frac_reward_zero_std": 0.0, "grad_norm": 0.09984061866998672, "kl": 0.0022822237806394696, "learning_rate": 2.26e-05, "loss": 0.0, "num_tokens": 1833294.0, "reward": 0.7749999761581421, "reward_std": 0.15000000596046448, "rewards/wrapped_reward_func/mean": 0.7749999761581421, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 227 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.041379310344827586, "frac_reward_zero_std": 0.0, "grad_norm": 0.05264453962445259, "kl": 0.0034198558423668146, "learning_rate": 2.2700000000000003e-05, "loss": 0.0, "num_tokens": 1838606.0, "reward": 0.7749999761581421, "reward_std": 0.15000000596046448, "rewards/wrapped_reward_func/mean": 0.7749999761581421, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 228 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04156079854809437, "frac_reward_zero_std": 0.0, "grad_norm": 0.22375784814357758, "kl": 0.006633127108216286, "learning_rate": 2.2800000000000002e-05, "loss": 0.0, "num_tokens": 1844690.0, "reward": 0.4000000059604645, "reward_std": 0.24494898319244385, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.24494896829128265, "step": 229 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.041742286751361164, "frac_reward_zero_std": 0.0, "grad_norm": 0.1308196634054184, "kl": 0.007162267807871103, "learning_rate": 2.29e-05, "loss": 0.0, "num_tokens": 1850774.0, "reward": 0.4000000059604645, "reward_std": 0.24494898319244385, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.24494896829128265, "step": 230 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04192377495462795, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006736588547937572, "kl": 0.005774754099547863, "learning_rate": 2.3000000000000003e-05, "loss": 0.0, "num_tokens": 1858522.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 231 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.042105263157894736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005917900707572699, "kl": 0.009072968736290932, "learning_rate": 2.3100000000000002e-05, "loss": 0.0, "num_tokens": 1866270.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 232 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04228675136116152, "frac_reward_zero_std": 1.0, "grad_norm": 0.00039625843055546284, "kl": 0.007587425410747528, "learning_rate": 2.32e-05, "loss": 0.0, "num_tokens": 1873458.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 233 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.042468239564428314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002589465584605932, "kl": 0.0033280347706750035, "learning_rate": 2.3300000000000004e-05, "loss": 0.0, "num_tokens": 1880646.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 234 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0426497277676951, "frac_reward_zero_std": 1.0, "grad_norm": 0.00041387169039808214, "kl": 0.007730984594672918, "learning_rate": 2.3400000000000003e-05, "loss": 0.0, "num_tokens": 1889894.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 235 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.042831215970961886, "frac_reward_zero_std": 1.0, "grad_norm": 0.00032953114714473486, "kl": 0.00495121581479907, "learning_rate": 2.35e-05, "loss": 0.0, "num_tokens": 1899142.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 236 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04301270417422867, "frac_reward_zero_std": 0.0, "grad_norm": 0.2560548782348633, "kl": 0.007499723695218563, "learning_rate": 2.36e-05, "loss": 0.0, "num_tokens": 1906006.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 237 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.043194192377495465, "frac_reward_zero_std": 0.0, "grad_norm": 0.2330034077167511, "kl": 0.01039757439866662, "learning_rate": 2.37e-05, "loss": 0.0, "num_tokens": 1912870.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 238 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04337568058076225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004194115463178605, "kl": 0.007040749303996563, "learning_rate": 2.38e-05, "loss": 0.0, "num_tokens": 1917738.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 239 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.043557168784029036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004396011936478317, "kl": 0.004946030676364899, "learning_rate": 2.39e-05, "loss": 0.0, "num_tokens": 1922606.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 240 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04373865698729583, "frac_reward_zero_std": 0.0, "grad_norm": 0.06653384119272232, "kl": 0.012606119271367788, "learning_rate": 2.4e-05, "loss": 0.0, "num_tokens": 1932730.0, "reward": 0.75, "reward_std": 0.09999998658895493, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.10000000149011612, "step": 241 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.043920145190562615, "frac_reward_zero_std": 0.0, "grad_norm": 0.05922889709472656, "kl": 0.0047608716413378716, "learning_rate": 2.41e-05, "loss": 0.0, "num_tokens": 1942854.0, "reward": 0.7999999523162842, "reward_std": 0.1154700443148613, "rewards/wrapped_reward_func/mean": 0.7999999523162842, "rewards/wrapped_reward_func/std": 0.1154700443148613, "step": 242 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0441016333938294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010180396493524313, "kl": 0.006502371747046709, "learning_rate": 2.4200000000000002e-05, "loss": 0.0, "num_tokens": 1955734.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 243 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04428312159709619, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006592859863303602, "kl": 0.006761995842680335, "learning_rate": 2.43e-05, "loss": 0.0, "num_tokens": 1968614.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 244 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04446460980036298, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046624403330497444, "kl": 0.007407110650092363, "learning_rate": 2.44e-05, "loss": 0.0, "num_tokens": 1973282.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 245 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.044646098003629765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005669986712746322, "kl": 0.007374822162091732, "learning_rate": 2.45e-05, "loss": 0.0, "num_tokens": 1977950.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 246 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04482758620689655, "frac_reward_zero_std": 0.0, "grad_norm": 0.3110150396823883, "kl": 0.02669902052730322, "learning_rate": 2.46e-05, "loss": 0.0, "num_tokens": 1986934.0, "reward": 0.7250000238418579, "reward_std": 0.4856267273426056, "rewards/wrapped_reward_func/mean": 0.7250000238418579, "rewards/wrapped_reward_func/std": 0.485626757144928, "step": 247 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04500907441016334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004788596706930548, "kl": 0.018138015642762184, "learning_rate": 2.47e-05, "loss": 0.0, "num_tokens": 1995918.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 248 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04519056261343013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020117023959755898, "kl": 0.01394237158820033, "learning_rate": 2.48e-05, "loss": 0.0, "num_tokens": 1999978.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 249 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.045372050816696916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007117357454262674, "kl": 0.006477365270256996, "learning_rate": 2.4900000000000002e-05, "loss": 0.0, "num_tokens": 2004038.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 250 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0455535390199637, "frac_reward_zero_std": 0.0, "grad_norm": 0.30316224694252014, "kl": 0.035296861082315445, "learning_rate": 2.5e-05, "loss": 0.0, "num_tokens": 2015038.0, "reward": 0.2750000059604645, "reward_std": 0.3403429687023163, "rewards/wrapped_reward_func/mean": 0.2750000059604645, "rewards/wrapped_reward_func/std": 0.3403429687023163, "step": 251 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04573502722323049, "frac_reward_zero_std": 0.0, "grad_norm": 0.3052361011505127, "kl": 0.0107593082357198, "learning_rate": 2.51e-05, "loss": 0.0, "num_tokens": 2026038.0, "reward": 0.44999998807907104, "reward_std": 0.33166247606277466, "rewards/wrapped_reward_func/mean": 0.44999998807907104, "rewards/wrapped_reward_func/std": 0.33166247606277466, "step": 252 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04591651542649728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007278493139892817, "kl": 0.01222587376832962, "learning_rate": 2.5200000000000003e-05, "loss": 0.0, "num_tokens": 2049866.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 253 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.046098003629764066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007231617928482592, "kl": 0.009255616925656796, "learning_rate": 2.5300000000000002e-05, "loss": 0.0, "num_tokens": 2073694.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 254 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04627949183303085, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005754604935646057, "kl": 0.01046685129404068, "learning_rate": 2.54e-05, "loss": 0.0, "num_tokens": 2084138.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 255 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04646098003629764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008364859386347234, "kl": 0.010537022491917014, "learning_rate": 2.5500000000000003e-05, "loss": 0.0, "num_tokens": 2094582.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 256 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04664246823956443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013102985685691237, "kl": 0.020654598250985146, "learning_rate": 2.5600000000000002e-05, "loss": 0.0, "num_tokens": 2100822.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 257 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.046823956442831216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010256675304844975, "kl": 0.021308658644557, "learning_rate": 2.57e-05, "loss": 0.0, "num_tokens": 2107062.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 258 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.047005444646098, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015717553906142712, "kl": 0.025629907846450806, "learning_rate": 2.58e-05, "loss": 0.0, "num_tokens": 2113282.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 259 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.047186932849364795, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008985342574305832, "kl": 0.017800359521061182, "learning_rate": 2.5900000000000003e-05, "loss": 0.0, "num_tokens": 2119502.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 260 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04736842105263158, "frac_reward_zero_std": 0.0, "grad_norm": 0.02445174939930439, "kl": 0.020573794841766357, "learning_rate": 2.6000000000000002e-05, "loss": 0.0, "num_tokens": 2127190.0, "reward": 0.9249999523162842, "reward_std": 0.050000011920928955, "rewards/wrapped_reward_func/mean": 0.9249999523162842, "rewards/wrapped_reward_func/std": 0.050000011920928955, "step": 261 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.047549909255898366, "frac_reward_zero_std": 0.0, "grad_norm": 0.024035824462771416, "kl": 0.013679493684321642, "learning_rate": 2.61e-05, "loss": 0.0, "num_tokens": 2134878.0, "reward": 0.9750000238418579, "reward_std": 0.050000011920928955, "rewards/wrapped_reward_func/mean": 0.9750000238418579, "rewards/wrapped_reward_func/std": 0.050000011920928955, "step": 262 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04773139745916515, "frac_reward_zero_std": 1.0, "grad_norm": 0.001108061522245407, "kl": 0.01577558647841215, "learning_rate": 2.6200000000000003e-05, "loss": 0.0, "num_tokens": 2140454.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 263 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.047912885662431945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005275365547277033, "kl": 0.012424001470208168, "learning_rate": 2.6300000000000002e-05, "loss": 0.0, "num_tokens": 2146030.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 264 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04809437386569873, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019375307019799948, "kl": 0.0127324762288481, "learning_rate": 2.64e-05, "loss": 0.0, "num_tokens": 2153714.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 265 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04827586206896552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004436778253875673, "kl": 0.006016455823555589, "learning_rate": 2.6500000000000004e-05, "loss": 0.0, "num_tokens": 2161398.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 266 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0484573502722323, "frac_reward_zero_std": 1.0, "grad_norm": 0.001060481066815555, "kl": 0.01608826220035553, "learning_rate": 2.6600000000000003e-05, "loss": 0.0, "num_tokens": 2168782.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 267 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.048638838475499095, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008203218458220363, "kl": 0.01387950824573636, "learning_rate": 2.6700000000000002e-05, "loss": 0.0, "num_tokens": 2176166.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 268 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04882032667876588, "frac_reward_zero_std": 0.0, "grad_norm": 0.13092558085918427, "kl": 0.007929029874503613, "learning_rate": 2.6800000000000004e-05, "loss": 0.0, "num_tokens": 2184558.0, "reward": 0.20000000298023224, "reward_std": 0.23094011843204498, "rewards/wrapped_reward_func/mean": 0.20000000298023224, "rewards/wrapped_reward_func/std": 0.23094011843204498, "step": 269 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04900181488203267, "frac_reward_zero_std": 0.0, "grad_norm": 0.1479121893644333, "kl": 0.010357076302170753, "learning_rate": 2.6900000000000003e-05, "loss": 0.0, "num_tokens": 2192950.0, "reward": 0.20000000298023224, "reward_std": 0.23094011843204498, "rewards/wrapped_reward_func/mean": 0.20000000298023224, "rewards/wrapped_reward_func/std": 0.23094011843204498, "step": 270 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04918330308529945, "frac_reward_zero_std": 0.0, "grad_norm": 0.07094156742095947, "kl": 0.009637881070375443, "learning_rate": 2.7000000000000002e-05, "loss": 0.0, "num_tokens": 2208238.0, "reward": 0.7999999523162842, "reward_std": 0.1154700443148613, "rewards/wrapped_reward_func/mean": 0.7999999523162842, "rewards/wrapped_reward_func/std": 0.1154700443148613, "step": 271 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.049364791288566245, "frac_reward_zero_std": 0.0, "grad_norm": 0.21678276360034943, "kl": 0.0043899890733882785, "learning_rate": 2.7100000000000005e-05, "loss": 0.0, "num_tokens": 2223526.0, "reward": 0.574999988079071, "reward_std": 0.39475730061531067, "rewards/wrapped_reward_func/mean": 0.574999988079071, "rewards/wrapped_reward_func/std": 0.39475730061531067, "step": 272 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04954627949183303, "frac_reward_zero_std": 1.0, "grad_norm": 0.001012799795717001, "kl": 0.014064918272197247, "learning_rate": 2.7200000000000004e-05, "loss": 0.0, "num_tokens": 2234330.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 273 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.04972776769509982, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006078578880988061, "kl": 0.01101196650415659, "learning_rate": 2.7300000000000003e-05, "loss": 0.0, "num_tokens": 2245134.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 274 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0499092558983666, "frac_reward_zero_std": 1.0, "grad_norm": 0.001422400469891727, "kl": 0.01640591723844409, "learning_rate": 2.7400000000000002e-05, "loss": 0.0, "num_tokens": 2248534.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 275 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.050090744101633396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008500735275447369, "kl": 0.018251688219606876, "learning_rate": 2.7500000000000004e-05, "loss": 0.0, "num_tokens": 2251934.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 276 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05027223230490018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006154471193440259, "kl": 0.01015452016144991, "learning_rate": 2.7600000000000003e-05, "loss": 0.0, "num_tokens": 2264082.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 277 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05045372050816697, "frac_reward_zero_std": 1.0, "grad_norm": 0.00050675607053563, "kl": 0.008785805199295282, "learning_rate": 2.7700000000000002e-05, "loss": 0.0, "num_tokens": 2276230.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 278 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05063520871143376, "frac_reward_zero_std": 0.0, "grad_norm": 0.1275569349527359, "kl": 0.016788749489933252, "learning_rate": 2.7800000000000005e-05, "loss": 0.0, "num_tokens": 2283882.0, "reward": 0.625, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.625, "rewards/wrapped_reward_func/std": 0.14999999105930328, "step": 279 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.050816696914700546, "frac_reward_zero_std": 0.0, "grad_norm": 0.022833753377199173, "kl": 0.00771157443523407, "learning_rate": 2.7900000000000004e-05, "loss": 0.0, "num_tokens": 2291534.0, "reward": 0.6749999523162842, "reward_std": 0.04999999329447746, "rewards/wrapped_reward_func/mean": 0.6749999523162842, "rewards/wrapped_reward_func/std": 0.04999998211860657, "step": 280 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05099818511796733, "frac_reward_zero_std": 0.0, "grad_norm": 0.08925533294677734, "kl": 0.020089538767933846, "learning_rate": 2.8000000000000003e-05, "loss": 0.0, "num_tokens": 2297702.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 281 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05117967332123412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005566634936258197, "kl": 0.02203097566962242, "learning_rate": 2.8100000000000005e-05, "loss": 0.0, "num_tokens": 2303870.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 282 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05136116152450091, "frac_reward_zero_std": 0.0, "grad_norm": 0.14055703580379486, "kl": 0.004102882696315646, "learning_rate": 2.8199999999999998e-05, "loss": 0.0, "num_tokens": 2318330.0, "reward": 0.6749999523162842, "reward_std": 0.44999998807907104, "rewards/wrapped_reward_func/mean": 0.6749999523162842, "rewards/wrapped_reward_func/std": 0.44999998807907104, "step": 283 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.051542649727767696, "frac_reward_zero_std": 1.0, "grad_norm": 0.00021112173271831125, "kl": 0.004984365892596543, "learning_rate": 2.83e-05, "loss": 0.0, "num_tokens": 2332790.0, "reward": 0.8999999761581421, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.8999999761581421, "rewards/wrapped_reward_func/std": 0.0, "step": 284 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05172413793103448, "frac_reward_zero_std": 0.0, "grad_norm": 0.031223773956298828, "kl": 0.010176204144954681, "learning_rate": 2.84e-05, "loss": 0.0, "num_tokens": 2339426.0, "reward": 0.9750000238418579, "reward_std": 0.050000011920928955, "rewards/wrapped_reward_func/mean": 0.9750000238418579, "rewards/wrapped_reward_func/std": 0.050000011920928955, "step": 285 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05190562613430127, "frac_reward_zero_std": 0.0, "grad_norm": 0.3343889117240906, "kl": 0.013731278944760561, "learning_rate": 2.8499999999999998e-05, "loss": 0.0, "num_tokens": 2346062.0, "reward": 0.7250000238418579, "reward_std": 0.4193248152732849, "rewards/wrapped_reward_func/mean": 0.7250000238418579, "rewards/wrapped_reward_func/std": 0.4193248450756073, "step": 286 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05208711433756806, "frac_reward_zero_std": 1.0, "grad_norm": 3.585683589335531e-05, "kl": 0.000875412515597418, "learning_rate": 2.86e-05, "loss": 0.0, "num_tokens": 2381022.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 287 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.052268602540834846, "frac_reward_zero_std": 1.0, "grad_norm": 3.9848629967309535e-05, "kl": 0.0008295548905152828, "learning_rate": 2.87e-05, "loss": 0.0, "num_tokens": 2415982.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 288 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05245009074410163, "frac_reward_zero_std": 0.0, "grad_norm": 0.2706444561481476, "kl": 0.012797310948371887, "learning_rate": 2.88e-05, "loss": 0.0, "num_tokens": 2423238.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 289 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05263157894736842, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005742169450968504, "kl": 0.0103056151419878, "learning_rate": 2.8899999999999998e-05, "loss": 0.0, "num_tokens": 2430494.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 290 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05281306715063521, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046230581938289106, "kl": 0.011022279504686594, "learning_rate": 2.9e-05, "loss": 0.0, "num_tokens": 2444714.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 291 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.052994555353902, "frac_reward_zero_std": 0.0, "grad_norm": 0.13084134459495544, "kl": 0.007340599782764912, "learning_rate": 2.91e-05, "loss": 0.0, "num_tokens": 2458934.0, "reward": 0.5249999761581421, "reward_std": 0.3499999940395355, "rewards/wrapped_reward_func/mean": 0.5249999761581421, "rewards/wrapped_reward_func/std": 0.3499999940395355, "step": 292 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05317604355716878, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008120578713715076, "kl": 0.010235932655632496, "learning_rate": 2.9199999999999998e-05, "loss": 0.0, "num_tokens": 2466246.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 293 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05335753176043557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007003439241088927, "kl": 0.013434076681733131, "learning_rate": 2.93e-05, "loss": 0.0, "num_tokens": 2473558.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 294 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05353901996370236, "frac_reward_zero_std": 1.0, "grad_norm": 0.00027735938783735037, "kl": 0.004087310051545501, "learning_rate": 2.94e-05, "loss": 0.0, "num_tokens": 2480418.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 295 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05372050816696915, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006836861139163375, "kl": 0.011375355068594217, "learning_rate": 2.95e-05, "loss": 0.0, "num_tokens": 2487278.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 296 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05390199637023593, "frac_reward_zero_std": 1.0, "grad_norm": 0.00047752822865732014, "kl": 0.008695818483829498, "learning_rate": 2.96e-05, "loss": 0.0, "num_tokens": 2491542.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 297 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.054083484573502726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006049277726560831, "kl": 0.009150051977485418, "learning_rate": 2.97e-05, "loss": 0.0, "num_tokens": 2495806.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 298 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05426497277676951, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005408656434156001, "kl": 0.008558522909879684, "learning_rate": 2.98e-05, "loss": 0.0, "num_tokens": 2501902.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 299 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0544464609800363, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008840917726047337, "kl": 0.014230011263862252, "learning_rate": 2.9900000000000002e-05, "loss": 0.0, "num_tokens": 2507998.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 300 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05462794918330308, "frac_reward_zero_std": 0.0, "grad_norm": 0.024701641872525215, "kl": 0.014557956717908382, "learning_rate": 3e-05, "loss": 0.0, "num_tokens": 2512658.0, "reward": 0.9249999523162842, "reward_std": 0.050000011920928955, "rewards/wrapped_reward_func/mean": 0.9249999523162842, "rewards/wrapped_reward_func/std": 0.050000011920928955, "step": 301 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.054809437386569876, "frac_reward_zero_std": 0.0, "grad_norm": 0.27831870317459106, "kl": 0.011218202766031027, "learning_rate": 3.01e-05, "loss": 0.0, "num_tokens": 2517318.0, "reward": 0.6749999523162842, "reward_std": 0.44999998807907104, "rewards/wrapped_reward_func/mean": 0.6749999523162842, "rewards/wrapped_reward_func/std": 0.44999998807907104, "step": 302 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05499092558983666, "frac_reward_zero_std": 0.0, "grad_norm": 0.03440956771373749, "kl": 0.009571915725246072, "learning_rate": 3.02e-05, "loss": 0.0, "num_tokens": 2520950.0, "reward": 0.9750000238418579, "reward_std": 0.050000011920928955, "rewards/wrapped_reward_func/mean": 0.9750000238418579, "rewards/wrapped_reward_func/std": 0.050000011920928955, "step": 303 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05517241379310345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005024394486099482, "kl": 0.00867360271513462, "learning_rate": 3.03e-05, "loss": 0.0, "num_tokens": 2524582.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 304 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05535390199637023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006598635227419436, "kl": 0.007544087246060371, "learning_rate": 3.04e-05, "loss": 0.0, "num_tokens": 2530802.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 305 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.055535390199637026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003844216698780656, "kl": 0.00951372180134058, "learning_rate": 3.05e-05, "loss": 0.0, "num_tokens": 2537022.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 306 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05571687840290381, "frac_reward_zero_std": 1.0, "grad_norm": 0.001570240594446659, "kl": 0.017347341869026423, "learning_rate": 3.06e-05, "loss": 0.0, "num_tokens": 2543882.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 307 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0558983666061706, "frac_reward_zero_std": 1.0, "grad_norm": 0.003049669088795781, "kl": 0.014745901804417372, "learning_rate": 3.07e-05, "loss": 0.0, "num_tokens": 2550742.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 308 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.056079854809437384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005457078805193305, "kl": 0.011619246564805508, "learning_rate": 3.08e-05, "loss": 0.0, "num_tokens": 2562066.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 309 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.056261343012704176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008743781363591552, "kl": 0.016567331738770008, "learning_rate": 3.09e-05, "loss": 0.0, "num_tokens": 2573390.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 310 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05644283121597096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013550127623602748, "kl": 0.014900763519108295, "learning_rate": 3.1e-05, "loss": 0.0, "num_tokens": 2580470.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 311 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05662431941923775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007740124128758907, "kl": 0.011904142331331968, "learning_rate": 3.1100000000000004e-05, "loss": 0.0, "num_tokens": 2587550.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 312 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.056805807622504534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006586573435924947, "kl": 0.011546493275091052, "learning_rate": 3.12e-05, "loss": 0.0, "num_tokens": 2592982.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 313 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05698729582577133, "frac_reward_zero_std": 1.0, "grad_norm": 0.002537332009524107, "kl": 0.02350751031190157, "learning_rate": 3.13e-05, "loss": 0.0, "num_tokens": 2598414.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 314 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05716878402903811, "frac_reward_zero_std": 0.0, "grad_norm": 0.043670713901519775, "kl": 0.0072309840470552444, "learning_rate": 3.1400000000000004e-05, "loss": 0.0, "num_tokens": 2603790.0, "reward": 0.8500000238418579, "reward_std": 0.09999998658895493, "rewards/wrapped_reward_func/mean": 0.8500000238418579, "rewards/wrapped_reward_func/std": 0.10000000149011612, "step": 315 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0573502722323049, "frac_reward_zero_std": 1.0, "grad_norm": 0.000633572693914175, "kl": 0.00530828651972115, "learning_rate": 3.15e-05, "loss": 0.0, "num_tokens": 2609166.0, "reward": 0.800000011920929, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.800000011920929, "rewards/wrapped_reward_func/std": 0.0, "step": 316 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05753176043557169, "frac_reward_zero_std": 1.0, "grad_norm": 0.000719416537322104, "kl": 0.012791499495506287, "learning_rate": 3.16e-05, "loss": 0.0, "num_tokens": 2616278.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 317 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05771324863883848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006333068013191223, "kl": 0.01302483631297946, "learning_rate": 3.1700000000000005e-05, "loss": 0.0, "num_tokens": 2623390.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 318 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05789473684210526, "frac_reward_zero_std": 0.0, "grad_norm": 0.13475216925144196, "kl": 0.008983689360320568, "learning_rate": 3.18e-05, "loss": 0.0, "num_tokens": 2635090.0, "reward": 0.5499999523162842, "reward_std": 0.4358898997306824, "rewards/wrapped_reward_func/mean": 0.5499999523162842, "rewards/wrapped_reward_func/std": 0.43588986992836, "step": 319 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05807622504537205, "frac_reward_zero_std": 0.0, "grad_norm": 0.14352115988731384, "kl": 0.005445819231681526, "learning_rate": 3.19e-05, "loss": 0.0, "num_tokens": 2646790.0, "reward": 0.5, "reward_std": 0.39157798886299133, "rewards/wrapped_reward_func/mean": 0.5, "rewards/wrapped_reward_func/std": 0.39157798886299133, "step": 320 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05825771324863884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013406428042799234, "kl": 0.012928911950439215, "learning_rate": 3.2000000000000005e-05, "loss": 0.0, "num_tokens": 2658470.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 321 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05843920145190563, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006411941139958799, "kl": 0.0119607113301754, "learning_rate": 3.21e-05, "loss": 0.0, "num_tokens": 2670150.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 322 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05862068965517241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007012159912846982, "kl": 0.018886784091591835, "learning_rate": 3.2200000000000003e-05, "loss": 0.0, "num_tokens": 2676298.0, "reward": 0.8999999761581421, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.8999999761581421, "rewards/wrapped_reward_func/std": 0.0, "step": 323 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0588021778584392, "frac_reward_zero_std": 0.0, "grad_norm": 0.04217100143432617, "kl": 0.033784933388233185, "learning_rate": 3.2300000000000006e-05, "loss": 0.0, "num_tokens": 2682446.0, "reward": 0.949999988079071, "reward_std": 0.05773504078388214, "rewards/wrapped_reward_func/mean": 0.949999988079071, "rewards/wrapped_reward_func/std": 0.05773504078388214, "step": 324 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05898366606170599, "frac_reward_zero_std": 0.0, "grad_norm": 0.2441469132900238, "kl": 0.011461753398180008, "learning_rate": 3.24e-05, "loss": 0.0, "num_tokens": 2687326.0, "reward": 0.25, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.25, "rewards/wrapped_reward_func/std": 0.5, "step": 325 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05916515426497278, "frac_reward_zero_std": 0.0, "grad_norm": 0.35362929105758667, "kl": 0.018579245544970036, "learning_rate": 3.2500000000000004e-05, "loss": 0.0, "num_tokens": 2692206.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 326 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05934664246823956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018086852505803108, "kl": 0.028937467373907566, "learning_rate": 3.26e-05, "loss": 0.0, "num_tokens": 2699342.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 327 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05952813067150635, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046204638783819973, "kl": 0.01155528100207448, "learning_rate": 3.27e-05, "loss": 0.0, "num_tokens": 2706478.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 328 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05970961887477314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006339769461192191, "kl": 0.013257715851068497, "learning_rate": 3.2800000000000004e-05, "loss": 0.0, "num_tokens": 2713102.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 329 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.05989110707803993, "frac_reward_zero_std": 1.0, "grad_norm": 0.00044400847400538623, "kl": 0.008935116697102785, "learning_rate": 3.29e-05, "loss": 0.0, "num_tokens": 2719726.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 330 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.060072595281306713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005025083082728088, "kl": 0.007069399114698172, "learning_rate": 3.3e-05, "loss": 0.0, "num_tokens": 2725518.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 331 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0602540834845735, "frac_reward_zero_std": 1.0, "grad_norm": 0.000787832192145288, "kl": 0.012104997411370277, "learning_rate": 3.3100000000000005e-05, "loss": 0.0, "num_tokens": 2731310.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 332 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06043557168784029, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010536519112065434, "kl": 0.022278236225247383, "learning_rate": 3.32e-05, "loss": 0.0, "num_tokens": 2737954.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 333 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06061705989110708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017688964726403356, "kl": 0.025696473196148872, "learning_rate": 3.33e-05, "loss": 0.0, "num_tokens": 2744598.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 334 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.060798548094373864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011481947731226683, "kl": 0.027032938785851002, "learning_rate": 3.3400000000000005e-05, "loss": 0.0, "num_tokens": 2748206.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 335 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.060980036297640657, "frac_reward_zero_std": 1.0, "grad_norm": 0.005025247577577829, "kl": 0.040659209713339806, "learning_rate": 3.35e-05, "loss": 0.0, "num_tokens": 2751814.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 336 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06116152450090744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011791929136961699, "kl": 0.024530894123017788, "learning_rate": 3.3600000000000004e-05, "loss": 0.0, "num_tokens": 2764770.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 337 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06134301270417423, "frac_reward_zero_std": 0.0, "grad_norm": 0.200640469789505, "kl": 0.019179750233888626, "learning_rate": 3.3700000000000006e-05, "loss": 0.0, "num_tokens": 2777726.0, "reward": 0.550000011920929, "reward_std": 0.30000001192092896, "rewards/wrapped_reward_func/mean": 0.550000011920929, "rewards/wrapped_reward_func/std": 0.29999998211860657, "step": 338 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.061524500907441014, "frac_reward_zero_std": 1.0, "grad_norm": 0.001586826634593308, "kl": 0.05694180727005005, "learning_rate": 3.38e-05, "loss": 0.0001, "num_tokens": 2784806.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 339 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06170598911070781, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017507069278508425, "kl": 0.03337153885513544, "learning_rate": 3.3900000000000004e-05, "loss": 0.0, "num_tokens": 2791886.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 340 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06188747731397459, "frac_reward_zero_std": 0.0, "grad_norm": 0.2999972701072693, "kl": 0.03528993856161833, "learning_rate": 3.4000000000000007e-05, "loss": 0.0, "num_tokens": 2797958.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 341 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06206896551724138, "frac_reward_zero_std": 0.0, "grad_norm": 0.3854820728302002, "kl": 0.04365203529596329, "learning_rate": 3.41e-05, "loss": 0.0, "num_tokens": 2804030.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 342 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.062250453720508164, "frac_reward_zero_std": 0.0, "grad_norm": 0.11420270055532455, "kl": 0.028789179399609566, "learning_rate": 3.4200000000000005e-05, "loss": 0.0, "num_tokens": 2822802.0, "reward": 0.32499998807907104, "reward_std": 0.15000000596046448, "rewards/wrapped_reward_func/mean": 0.32499998807907104, "rewards/wrapped_reward_func/std": 0.15000000596046448, "step": 343 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06243194192377496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012524632038548589, "kl": 0.026394134387373924, "learning_rate": 3.430000000000001e-05, "loss": 0.0, "num_tokens": 2841574.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 344 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06261343012704174, "frac_reward_zero_std": 0.0, "grad_norm": 0.03632964566349983, "kl": 0.025393545627593994, "learning_rate": 3.4399999999999996e-05, "loss": 0.0, "num_tokens": 2848954.0, "reward": 0.9750000238418579, "reward_std": 0.050000011920928955, "rewards/wrapped_reward_func/mean": 0.9750000238418579, "rewards/wrapped_reward_func/std": 0.050000011920928955, "step": 345 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06279491833030854, "frac_reward_zero_std": 0.0, "grad_norm": 0.04377472773194313, "kl": 0.019698094576597214, "learning_rate": 3.45e-05, "loss": 0.0, "num_tokens": 2856334.0, "reward": 0.949999988079071, "reward_std": 0.05773504078388214, "rewards/wrapped_reward_func/mean": 0.949999988079071, "rewards/wrapped_reward_func/std": 0.05773504078388214, "step": 346 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06297640653357532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013300770660862327, "kl": 0.03183883521705866, "learning_rate": 3.46e-05, "loss": 0.0, "num_tokens": 2862946.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 347 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06315789473684211, "frac_reward_zero_std": 1.0, "grad_norm": 0.001675883773714304, "kl": 0.030815428122878075, "learning_rate": 3.4699999999999996e-05, "loss": 0.0, "num_tokens": 2869558.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 348 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0633393829401089, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008484208374284208, "kl": 0.012496890965849161, "learning_rate": 3.48e-05, "loss": 0.0, "num_tokens": 2873754.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 349 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06352087114337568, "frac_reward_zero_std": 1.0, "grad_norm": 0.006584591697901487, "kl": 0.03475571097806096, "learning_rate": 3.49e-05, "loss": 0.0, "num_tokens": 2877950.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 350 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06370235934664246, "frac_reward_zero_std": 0.0, "grad_norm": 0.07617805153131485, "kl": 0.02054460719227791, "learning_rate": 3.5e-05, "loss": 0.0, "num_tokens": 2883238.0, "reward": 0.20000000298023224, "reward_std": 0.23094011843204498, "rewards/wrapped_reward_func/mean": 0.20000000298023224, "rewards/wrapped_reward_func/std": 0.23094011843204498, "step": 351 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06388384754990925, "frac_reward_zero_std": 0.0, "grad_norm": 0.08944585919380188, "kl": 0.029365453869104385, "learning_rate": 3.51e-05, "loss": 0.0, "num_tokens": 2888526.0, "reward": 0.20000000298023224, "reward_std": 0.23094011843204498, "rewards/wrapped_reward_func/mean": 0.20000000298023224, "rewards/wrapped_reward_func/std": 0.23094011843204498, "step": 352 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06406533575317605, "frac_reward_zero_std": 1.0, "grad_norm": 0.001284038764424622, "kl": 0.03969292528927326, "learning_rate": 3.52e-05, "loss": 0.0, "num_tokens": 2894674.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 353 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06424682395644284, "frac_reward_zero_std": 0.0, "grad_norm": 0.30142515897750854, "kl": 0.04559899494051933, "learning_rate": 3.53e-05, "loss": 0.0, "num_tokens": 2900822.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 354 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06442831215970962, "frac_reward_zero_std": 0.0, "grad_norm": 0.10728510469198227, "kl": 0.04953271523118019, "learning_rate": 3.54e-05, "loss": 0.0, "num_tokens": 2906314.0, "reward": 0.8499999642372131, "reward_std": 0.10000000149011612, "rewards/wrapped_reward_func/mean": 0.8499999642372131, "rewards/wrapped_reward_func/std": 0.10000000149011612, "step": 355 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06460980036297641, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016061131609603763, "kl": 0.025830541737377644, "learning_rate": 3.55e-05, "loss": 0.0, "num_tokens": 2911806.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 356 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0647912885662432, "frac_reward_zero_std": 1.0, "grad_norm": 0.002768987324088812, "kl": 0.07821455597877502, "learning_rate": 3.56e-05, "loss": 0.0001, "num_tokens": 2916234.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 357 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06497277676950998, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012763128615915775, "kl": 0.06301657110452652, "learning_rate": 3.57e-05, "loss": 0.0001, "num_tokens": 2920662.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 358 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06515426497277677, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009714583866298199, "kl": 0.032316820695996284, "learning_rate": 3.58e-05, "loss": 0.0, "num_tokens": 2925110.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 359 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06533575317604355, "frac_reward_zero_std": 1.0, "grad_norm": 0.001164024812169373, "kl": 0.044055117294192314, "learning_rate": 3.59e-05, "loss": 0.0, "num_tokens": 2929558.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 360 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06551724137931035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036972826346755028, "kl": 0.057932570576667786, "learning_rate": 3.6e-05, "loss": 0.0001, "num_tokens": 2936898.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 361 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06569872958257714, "frac_reward_zero_std": 1.0, "grad_norm": 0.003618560265749693, "kl": 0.06923622451722622, "learning_rate": 3.61e-05, "loss": 0.0001, "num_tokens": 2944238.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 362 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06588021778584392, "frac_reward_zero_std": 1.0, "grad_norm": 0.001513795810751617, "kl": 0.05245739780366421, "learning_rate": 3.62e-05, "loss": 0.0001, "num_tokens": 2950506.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 363 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06606170598911071, "frac_reward_zero_std": 1.0, "grad_norm": 0.001403735252097249, "kl": 0.04531806707382202, "learning_rate": 3.63e-05, "loss": 0.0, "num_tokens": 2956774.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 364 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0662431941923775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008368663839064538, "kl": 0.04977698624134064, "learning_rate": 3.6400000000000004e-05, "loss": 0.0, "num_tokens": 2962198.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 365 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06642468239564428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024627295788377523, "kl": 0.07059645466506481, "learning_rate": 3.65e-05, "loss": 0.0001, "num_tokens": 2967622.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 366 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06660617059891107, "frac_reward_zero_std": 1.0, "grad_norm": 0.002741317031905055, "kl": 0.05744955129921436, "learning_rate": 3.66e-05, "loss": 0.0001, "num_tokens": 2973186.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 367 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06678765880217785, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009200468193739653, "kl": 0.034581043757498264, "learning_rate": 3.6700000000000004e-05, "loss": 0.0, "num_tokens": 2978750.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 368 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06696914700544465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012370485346764326, "kl": 0.06782949157059193, "learning_rate": 3.68e-05, "loss": 0.0001, "num_tokens": 2988146.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 369 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06715063520871144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019109590211883187, "kl": 0.0555072408169508, "learning_rate": 3.69e-05, "loss": 0.0001, "num_tokens": 2997542.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 370 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06733212341197822, "frac_reward_zero_std": 0.0, "grad_norm": 0.2568616271018982, "kl": 0.05095750465989113, "learning_rate": 3.7e-05, "loss": 0.0001, "num_tokens": 3000862.0, "reward": 0.2750000059604645, "reward_std": 0.4856267273426056, "rewards/wrapped_reward_func/mean": 0.2750000059604645, "rewards/wrapped_reward_func/std": 0.485626757144928, "step": 371 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06751361161524501, "frac_reward_zero_std": 0.0, "grad_norm": 0.14534391462802887, "kl": 0.054770804941654205, "learning_rate": 3.71e-05, "loss": 0.0001, "num_tokens": 3004182.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 372 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0676950998185118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018603401258587837, "kl": 0.06733610481023788, "learning_rate": 3.72e-05, "loss": 0.0001, "num_tokens": 3021478.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 373 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06787658802177858, "frac_reward_zero_std": 1.0, "grad_norm": 0.003420312190428376, "kl": 0.044396353885531425, "learning_rate": 3.73e-05, "loss": 0.0, "num_tokens": 3038774.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 374 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06805807622504537, "frac_reward_zero_std": 0.0, "grad_norm": 0.02317805588245392, "kl": 0.02406476018950343, "learning_rate": 3.74e-05, "loss": 0.0, "num_tokens": 3042466.0, "reward": 0.9750000238418579, "reward_std": 0.050000011920928955, "rewards/wrapped_reward_func/mean": 0.9750000238418579, "rewards/wrapped_reward_func/std": 0.050000011920928955, "step": 375 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06823956442831217, "frac_reward_zero_std": 1.0, "grad_norm": 0.000893576187081635, "kl": 0.01858829241245985, "learning_rate": 3.7500000000000003e-05, "loss": 0.0, "num_tokens": 3046158.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 376 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06842105263157895, "frac_reward_zero_std": 0.0, "grad_norm": 0.036286644637584686, "kl": 0.03230278193950653, "learning_rate": 3.76e-05, "loss": 0.0, "num_tokens": 3052694.0, "reward": 0.949999988079071, "reward_std": 0.05773504078388214, "rewards/wrapped_reward_func/mean": 0.949999988079071, "rewards/wrapped_reward_func/std": 0.05773504078388214, "step": 377 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06860254083484574, "frac_reward_zero_std": 0.0, "grad_norm": 0.04385578632354736, "kl": 0.0448400154709816, "learning_rate": 3.77e-05, "loss": 0.0, "num_tokens": 3059230.0, "reward": 0.9249999523162842, "reward_std": 0.050000011920928955, "rewards/wrapped_reward_func/mean": 0.9249999523162842, "rewards/wrapped_reward_func/std": 0.050000011920928955, "step": 378 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06878402903811252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010545988334342837, "kl": 0.03396552987396717, "learning_rate": 3.7800000000000004e-05, "loss": 0.0, "num_tokens": 3065354.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 379 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06896551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013457256136462092, "kl": 0.034121397882699966, "learning_rate": 3.79e-05, "loss": 0.0, "num_tokens": 3071478.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 380 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0691470054446461, "frac_reward_zero_std": 1.0, "grad_norm": 0.00264293747022748, "kl": 0.03815429215319455, "learning_rate": 3.8e-05, "loss": 0.0, "num_tokens": 3075954.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 381 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06932849364791288, "frac_reward_zero_std": 1.0, "grad_norm": 0.000977964955382049, "kl": 0.024337527342140675, "learning_rate": 3.8100000000000005e-05, "loss": 0.0, "num_tokens": 3080430.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 382 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06950998185117967, "frac_reward_zero_std": 1.0, "grad_norm": 0.001490750815719366, "kl": 0.03354365658015013, "learning_rate": 3.82e-05, "loss": 0.0, "num_tokens": 3083678.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 383 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06969147005444647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015330062014982104, "kl": 0.02263825200498104, "learning_rate": 3.83e-05, "loss": 0.0, "num_tokens": 3086926.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 384 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.06987295825771325, "frac_reward_zero_std": 0.0, "grad_norm": 0.0497903972864151, "kl": 0.05429598316550255, "learning_rate": 3.8400000000000005e-05, "loss": 0.0001, "num_tokens": 3094726.0, "reward": 0.949999988079071, "reward_std": 0.05773504078388214, "rewards/wrapped_reward_func/mean": 0.949999988079071, "rewards/wrapped_reward_func/std": 0.05773504078388214, "step": 385 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07005444646098004, "frac_reward_zero_std": 0.0, "grad_norm": 0.037834879010915756, "kl": 0.055259186774492264, "learning_rate": 3.85e-05, "loss": 0.0001, "num_tokens": 3102526.0, "reward": 0.9249999523162842, "reward_std": 0.050000011920928955, "rewards/wrapped_reward_func/mean": 0.9249999523162842, "rewards/wrapped_reward_func/std": 0.050000011920928955, "step": 386 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07023593466424682, "frac_reward_zero_std": 0.0, "grad_norm": 0.18350085616111755, "kl": 0.03533359803259373, "learning_rate": 3.86e-05, "loss": 0.0, "num_tokens": 3109446.0, "reward": 0.550000011920929, "reward_std": 0.29999998211860657, "rewards/wrapped_reward_func/mean": 0.550000011920929, "rewards/wrapped_reward_func/std": 0.29999998211860657, "step": 387 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07041742286751361, "frac_reward_zero_std": 0.0, "grad_norm": 0.2649555802345276, "kl": 0.04381866380572319, "learning_rate": 3.8700000000000006e-05, "loss": 0.0, "num_tokens": 3116366.0, "reward": 0.699999988079071, "reward_std": 0.3464101552963257, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.3464101552963257, "step": 388 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0705989110707804, "frac_reward_zero_std": 0.0, "grad_norm": 0.12277352809906006, "kl": 0.028239517472684383, "learning_rate": 3.88e-05, "loss": 0.0, "num_tokens": 3121558.0, "reward": 0.10000000149011612, "reward_std": 0.20000000298023224, "rewards/wrapped_reward_func/mean": 0.10000000149011612, "rewards/wrapped_reward_func/std": 0.20000000298023224, "step": 389 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07078039927404718, "frac_reward_zero_std": 0.0, "grad_norm": 0.17623348534107208, "kl": 0.04269365407526493, "learning_rate": 3.8900000000000004e-05, "loss": 0.0, "num_tokens": 3126750.0, "reward": 0.20000000298023224, "reward_std": 0.23094011843204498, "rewards/wrapped_reward_func/mean": 0.20000000298023224, "rewards/wrapped_reward_func/std": 0.23094011843204498, "step": 390 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07096188747731398, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006486757192760706, "kl": 0.02402433380484581, "learning_rate": 3.9000000000000006e-05, "loss": 0.0, "num_tokens": 3135006.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 391 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07114337568058077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025139874778687954, "kl": 0.04263159818947315, "learning_rate": 3.91e-05, "loss": 0.0, "num_tokens": 3143262.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 392 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07132486388384755, "frac_reward_zero_std": 1.0, "grad_norm": 0.001192178693599999, "kl": 0.028158091939985752, "learning_rate": 3.9200000000000004e-05, "loss": 0.0, "num_tokens": 3148146.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 393 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07150635208711434, "frac_reward_zero_std": 0.0, "grad_norm": 0.1296001374721527, "kl": 0.02857186086475849, "learning_rate": 3.9300000000000007e-05, "loss": 0.0, "num_tokens": 3153030.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 394 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07168784029038112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017481368267908692, "kl": 0.04117462411522865, "learning_rate": 3.94e-05, "loss": 0.0, "num_tokens": 3166110.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 395 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07186932849364791, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033345045521855354, "kl": 0.05010554566979408, "learning_rate": 3.9500000000000005e-05, "loss": 0.0001, "num_tokens": 3179190.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 396 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0720508166969147, "frac_reward_zero_std": 1.0, "grad_norm": 0.001412017154507339, "kl": 0.040715740993618965, "learning_rate": 3.960000000000001e-05, "loss": 0.0, "num_tokens": 3183590.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 397 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07223230490018148, "frac_reward_zero_std": 1.0, "grad_norm": 0.00144410808570683, "kl": 0.02518117893487215, "learning_rate": 3.97e-05, "loss": 0.0, "num_tokens": 3187990.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 398 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07241379310344828, "frac_reward_zero_std": 0.0, "grad_norm": 0.04573395475745201, "kl": 0.040694489143788815, "learning_rate": 3.9800000000000005e-05, "loss": 0.0, "num_tokens": 3195594.0, "reward": 0.375, "reward_std": 0.04999999329447746, "rewards/wrapped_reward_func/mean": 0.375, "rewards/wrapped_reward_func/std": 0.05000000074505806, "step": 399 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07259528130671507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024348394945263863, "kl": 0.04713786207139492, "learning_rate": 3.99e-05, "loss": 0.0, "num_tokens": 3203198.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 400 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07277676950998185, "frac_reward_zero_std": 1.0, "grad_norm": 0.002348546637222171, "kl": 0.04124763235449791, "learning_rate": 4e-05, "loss": 0.0, "num_tokens": 3209930.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 401 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07295825771324864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008387255948036909, "kl": 0.029846238903701305, "learning_rate": 4.0100000000000006e-05, "loss": 0.0, "num_tokens": 3216662.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 402 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07313974591651543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018622620264068246, "kl": 0.04086045082658529, "learning_rate": 4.02e-05, "loss": 0.0, "num_tokens": 3222054.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 403 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07332123411978221, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030769777949899435, "kl": 0.04654042050242424, "learning_rate": 4.0300000000000004e-05, "loss": 0.0, "num_tokens": 3227446.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 404 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.073502722323049, "frac_reward_zero_std": 0.0, "grad_norm": 0.26682841777801514, "kl": 0.03266127407550812, "learning_rate": 4.0400000000000006e-05, "loss": 0.0, "num_tokens": 3242186.0, "reward": 0.699999988079071, "reward_std": 0.3464101552963257, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.3464101552963257, "step": 405 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07368421052631578, "frac_reward_zero_std": 0.0, "grad_norm": 0.3306557536125183, "kl": 0.04335389845073223, "learning_rate": 4.05e-05, "loss": 0.0, "num_tokens": 3256926.0, "reward": 0.699999988079071, "reward_std": 0.3464101552963257, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.3464101552963257, "step": 406 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07386569872958258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010206993902102113, "kl": 0.04111306928098202, "learning_rate": 4.0600000000000004e-05, "loss": 0.0, "num_tokens": 3262478.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 407 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07404718693284937, "frac_reward_zero_std": 1.0, "grad_norm": 0.006345921661704779, "kl": 0.0634209904819727, "learning_rate": 4.07e-05, "loss": 0.0001, "num_tokens": 3268030.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 408 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07422867513611615, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015566650545224547, "kl": 0.03769722580909729, "learning_rate": 4.08e-05, "loss": 0.0, "num_tokens": 3271334.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 409 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07441016333938294, "frac_reward_zero_std": 1.0, "grad_norm": 0.002224185736849904, "kl": 0.05324155278503895, "learning_rate": 4.09e-05, "loss": 0.0001, "num_tokens": 3274638.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 410 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07459165154264973, "frac_reward_zero_std": 1.0, "grad_norm": 0.001839076285250485, "kl": 0.0380413681268692, "learning_rate": 4.1e-05, "loss": 0.0, "num_tokens": 3280638.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 411 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07477313974591651, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007437526364810765, "kl": 0.028562751598656178, "learning_rate": 4.11e-05, "loss": 0.0, "num_tokens": 3286638.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 412 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0749546279491833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028859616722911596, "kl": 0.0628531388938427, "learning_rate": 4.12e-05, "loss": 0.0001, "num_tokens": 3294698.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 413 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0751361161524501, "frac_reward_zero_std": 1.0, "grad_norm": 0.007533976808190346, "kl": 0.056039225310087204, "learning_rate": 4.13e-05, "loss": 0.0001, "num_tokens": 3302758.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 414 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07531760435571688, "frac_reward_zero_std": 1.0, "grad_norm": 0.002472235821187496, "kl": 0.06488249637186527, "learning_rate": 4.14e-05, "loss": 0.0001, "num_tokens": 3312150.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 415 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07549909255898367, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021658989135175943, "kl": 0.0722002424299717, "learning_rate": 4.15e-05, "loss": 0.0001, "num_tokens": 3321542.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 416 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07568058076225045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014869084116071463, "kl": 0.048992061987519264, "learning_rate": 4.16e-05, "loss": 0.0, "num_tokens": 3326990.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 417 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07586206896551724, "frac_reward_zero_std": 1.0, "grad_norm": 0.000997894792817533, "kl": 0.04129803366959095, "learning_rate": 4.17e-05, "loss": 0.0, "num_tokens": 3332438.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 418 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07604355716878403, "frac_reward_zero_std": 1.0, "grad_norm": 0.004413351882249117, "kl": 0.0634375810623169, "learning_rate": 4.18e-05, "loss": 0.0001, "num_tokens": 3347354.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 419 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07622504537205081, "frac_reward_zero_std": 0.0, "grad_norm": 0.09845826029777527, "kl": 0.085184745490551, "learning_rate": 4.19e-05, "loss": 0.0001, "num_tokens": 3362270.0, "reward": 0.75, "reward_std": 0.09999998658895493, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.10000000149011612, "step": 420 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0764065335753176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027156234718859196, "kl": 0.07056407630443573, "learning_rate": 4.2e-05, "loss": 0.0001, "num_tokens": 3370370.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 421 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0765880217785844, "frac_reward_zero_std": 1.0, "grad_norm": 0.003588775871321559, "kl": 0.08293386548757553, "learning_rate": 4.21e-05, "loss": 0.0001, "num_tokens": 3378470.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 422 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07676950998185118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018374716164544225, "kl": 0.05231049656867981, "learning_rate": 4.22e-05, "loss": 0.0001, "num_tokens": 3383990.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 423 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07695099818511797, "frac_reward_zero_std": 1.0, "grad_norm": 0.019222727045416832, "kl": 0.14174048975110054, "learning_rate": 4.23e-05, "loss": 0.0001, "num_tokens": 3389510.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 424 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07713248638838476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029731402173638344, "kl": 0.05683453939855099, "learning_rate": 4.24e-05, "loss": 0.0001, "num_tokens": 3394906.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 425 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07731397459165154, "frac_reward_zero_std": 1.0, "grad_norm": 0.01098526082932949, "kl": 0.08893289417028427, "learning_rate": 4.25e-05, "loss": 0.0001, "num_tokens": 3400302.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 426 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07749546279491833, "frac_reward_zero_std": 1.0, "grad_norm": 0.006874803453683853, "kl": 0.06799864396452904, "learning_rate": 4.26e-05, "loss": 0.0001, "num_tokens": 3408610.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 427 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07767695099818511, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007751657394692302, "kl": 0.03679514303803444, "learning_rate": 4.27e-05, "loss": 0.0, "num_tokens": 3416918.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 428 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07785843920145191, "frac_reward_zero_std": 1.0, "grad_norm": 0.001063210773281753, "kl": 0.039396824315190315, "learning_rate": 4.2800000000000004e-05, "loss": 0.0, "num_tokens": 3429990.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 429 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0780399274047187, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008668023510836065, "kl": 0.03405775781720877, "learning_rate": 4.29e-05, "loss": 0.0, "num_tokens": 3443062.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 430 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07822141560798548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006809435435570776, "kl": 0.03419783525168896, "learning_rate": 4.3e-05, "loss": 0.0, "num_tokens": 3454982.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 431 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07840290381125227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007461539935320616, "kl": 0.0308984387665987, "learning_rate": 4.3100000000000004e-05, "loss": 0.0, "num_tokens": 3466902.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 432 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07858439201451906, "frac_reward_zero_std": 0.0, "grad_norm": 0.033061541616916656, "kl": 0.02655835822224617, "learning_rate": 4.32e-05, "loss": 0.0, "num_tokens": 3472490.0, "reward": 0.6499999761581421, "reward_std": 0.057735007256269455, "rewards/wrapped_reward_func/mean": 0.6499999761581421, "rewards/wrapped_reward_func/std": 0.057735007256269455, "step": 433 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07876588021778584, "frac_reward_zero_std": 0.0, "grad_norm": 0.027020826935768127, "kl": 0.03454662673175335, "learning_rate": 4.33e-05, "loss": 0.0, "num_tokens": 3478078.0, "reward": 0.6749999523162842, "reward_std": 0.04999999329447746, "rewards/wrapped_reward_func/mean": 0.6749999523162842, "rewards/wrapped_reward_func/std": 0.04999998211860657, "step": 434 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07894736842105263, "frac_reward_zero_std": 0.0, "grad_norm": 0.13184340298175812, "kl": 0.08227598294615746, "learning_rate": 4.3400000000000005e-05, "loss": 0.0001, "num_tokens": 3485574.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 435 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07912885662431941, "frac_reward_zero_std": 1.0, "grad_norm": 0.00875530019402504, "kl": 0.085557721555233, "learning_rate": 4.35e-05, "loss": 0.0001, "num_tokens": 3493070.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 436 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07931034482758621, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034837243147194386, "kl": 0.044360823929309845, "learning_rate": 4.36e-05, "loss": 0.0, "num_tokens": 3499778.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 437 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.079491833030853, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042328438721597195, "kl": 0.052434155717492104, "learning_rate": 4.3700000000000005e-05, "loss": 0.0001, "num_tokens": 3506486.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 438 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07967332123411978, "frac_reward_zero_std": 1.0, "grad_norm": 0.00148865079972893, "kl": 0.04273734427988529, "learning_rate": 4.38e-05, "loss": 0.0, "num_tokens": 3513050.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 439 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.07985480943738657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007939567440189421, "kl": 0.03262108191847801, "learning_rate": 4.39e-05, "loss": 0.0, "num_tokens": 3519614.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 440 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08003629764065336, "frac_reward_zero_std": 0.0, "grad_norm": 0.41448476910591125, "kl": 0.05551823228597641, "learning_rate": 4.4000000000000006e-05, "loss": 0.0001, "num_tokens": 3528794.0, "reward": 0.75, "reward_std": 0.5, "rewards/wrapped_reward_func/mean": 0.75, "rewards/wrapped_reward_func/std": 0.5, "step": 441 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08021778584392014, "frac_reward_zero_std": 0.0, "grad_norm": 0.10069494694471359, "kl": 0.08132223412394524, "learning_rate": 4.41e-05, "loss": 0.0001, "num_tokens": 3537974.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 442 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08039927404718693, "frac_reward_zero_std": 1.0, "grad_norm": 0.001090483390726149, "kl": 0.03579960856586695, "learning_rate": 4.4200000000000004e-05, "loss": 0.0, "num_tokens": 3541258.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 443 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08058076225045373, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013164884876459837, "kl": 0.05526914540678263, "learning_rate": 4.43e-05, "loss": 0.0001, "num_tokens": 3544542.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 444 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08076225045372051, "frac_reward_zero_std": 1.0, "grad_norm": 0.004198664333671331, "kl": 0.06548885069787502, "learning_rate": 4.44e-05, "loss": 0.0001, "num_tokens": 3548758.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 445 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0809437386569873, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058217584155499935, "kl": 0.07073243521153927, "learning_rate": 4.4500000000000004e-05, "loss": 0.0001, "num_tokens": 3552974.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.699999988079071, "rewards/wrapped_reward_func/std": 0.0, "step": 446 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08112522686025408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019332718802616, "kl": 0.057408466935157776, "learning_rate": 4.46e-05, "loss": 0.0001, "num_tokens": 3558642.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 447 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08130671506352087, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028136761393398046, "kl": 0.055097391828894615, "learning_rate": 4.47e-05, "loss": 0.0001, "num_tokens": 3564310.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 448 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08148820326678766, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011132995132356882, "kl": 0.07314513437449932, "learning_rate": 4.4800000000000005e-05, "loss": 0.0001, "num_tokens": 3570454.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 449 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08166969147005444, "frac_reward_zero_std": 1.0, "grad_norm": 0.000880289648193866, "kl": 0.06384582445025444, "learning_rate": 4.49e-05, "loss": 0.0001, "num_tokens": 3576598.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 450 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08185117967332123, "frac_reward_zero_std": 1.0, "grad_norm": 0.004129377193748951, "kl": 0.05217841453850269, "learning_rate": 4.5e-05, "loss": 0.0001, "num_tokens": 3582870.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 451 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08203266787658803, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010447067907080054, "kl": 0.04082373343408108, "learning_rate": 4.5100000000000005e-05, "loss": 0.0, "num_tokens": 3589142.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 452 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08221415607985481, "frac_reward_zero_std": 0.0, "grad_norm": 0.12262605875730515, "kl": 0.08214755542576313, "learning_rate": 4.52e-05, "loss": 0.0001, "num_tokens": 3612326.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 453 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0823956442831216, "frac_reward_zero_std": 0.0, "grad_norm": 0.12493463605642319, "kl": 0.08801200613379478, "learning_rate": 4.53e-05, "loss": 0.0001, "num_tokens": 3635510.0, "reward": 0.8999999761581421, "reward_std": 0.1414213478565216, "rewards/wrapped_reward_func/mean": 0.8999999761581421, "rewards/wrapped_reward_func/std": 0.1414213627576828, "step": 454 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08257713248638839, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018088078359141946, "kl": 0.07587864995002747, "learning_rate": 4.5400000000000006e-05, "loss": 0.0001, "num_tokens": 3641150.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 455 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08275862068965517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017802476650103927, "kl": 0.07255147397518158, "learning_rate": 4.55e-05, "loss": 0.0001, "num_tokens": 3646790.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 456 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08294010889292196, "frac_reward_zero_std": 1.0, "grad_norm": 0.004353760275989771, "kl": 0.08423202112317085, "learning_rate": 4.5600000000000004e-05, "loss": 0.0001, "num_tokens": 3652370.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 457 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08312159709618874, "frac_reward_zero_std": 1.0, "grad_norm": 0.001349235768429935, "kl": 0.07632169872522354, "learning_rate": 4.5700000000000006e-05, "loss": 0.0001, "num_tokens": 3657950.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 458 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08330308529945553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020431512966752052, "kl": 0.0961260125041008, "learning_rate": 4.58e-05, "loss": 0.0001, "num_tokens": 3661330.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 459 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08348457350272233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071617187932133675, "kl": 0.15512722730636597, "learning_rate": 4.5900000000000004e-05, "loss": 0.0002, "num_tokens": 3664710.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 460 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08366606170598911, "frac_reward_zero_std": 0.0, "grad_norm": 0.1616966426372528, "kl": 0.1347179301083088, "learning_rate": 4.600000000000001e-05, "loss": 0.0001, "num_tokens": 3669774.0, "reward": 0.7749999761581421, "reward_std": 0.15000000596046448, "rewards/wrapped_reward_func/mean": 0.7749999761581421, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 461 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0838475499092559, "frac_reward_zero_std": 0.0, "grad_norm": 0.12277400493621826, "kl": 0.0761793777346611, "learning_rate": 4.61e-05, "loss": 0.0001, "num_tokens": 3674838.0, "reward": 0.7749999761581421, "reward_std": 0.15000000596046448, "rewards/wrapped_reward_func/mean": 0.7749999761581421, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 462 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08402903811252269, "frac_reward_zero_std": 1.0, "grad_norm": 0.005020746961236, "kl": 0.10882649198174477, "learning_rate": 4.6200000000000005e-05, "loss": 0.0001, "num_tokens": 3678890.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 463 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08421052631578947, "frac_reward_zero_std": 0.0, "grad_norm": 0.1190323606133461, "kl": 0.0950850211083889, "learning_rate": 4.630000000000001e-05, "loss": 0.0001, "num_tokens": 3682942.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 464 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08439201451905626, "frac_reward_zero_std": 1.0, "grad_norm": 0.00277586723677814, "kl": 0.07232636585831642, "learning_rate": 4.64e-05, "loss": 0.0001, "num_tokens": 3688294.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 465 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08457350272232304, "frac_reward_zero_std": 0.0, "grad_norm": 0.09676417708396912, "kl": 0.05655540153384209, "learning_rate": 4.6500000000000005e-05, "loss": 0.0001, "num_tokens": 3693646.0, "reward": 0.925000011920929, "reward_std": 0.14999999105930328, "rewards/wrapped_reward_func/mean": 0.925000011920929, "rewards/wrapped_reward_func/std": 0.15000002086162567, "step": 466 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08475499092558984, "frac_reward_zero_std": 1.0, "grad_norm": 0.002281506545841694, "kl": 0.08663711324334145, "learning_rate": 4.660000000000001e-05, "loss": 0.0001, "num_tokens": 3697646.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 467 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08493647912885663, "frac_reward_zero_std": 0.0, "grad_norm": 0.1443251520395279, "kl": 0.08085461892187595, "learning_rate": 4.6700000000000003e-05, "loss": 0.0001, "num_tokens": 3701646.0, "reward": 0.8999999761581421, "reward_std": 0.19999998807907104, "rewards/wrapped_reward_func/mean": 0.8999999761581421, "rewards/wrapped_reward_func/std": 0.20000000298023224, "step": 468 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08511796733212341, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018909935606643558, "kl": 0.06386374682188034, "learning_rate": 4.6800000000000006e-05, "loss": 0.0001, "num_tokens": 3709722.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 469 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0852994555353902, "frac_reward_zero_std": 1.0, "grad_norm": 0.004717918578535318, "kl": 0.09652997925877571, "learning_rate": 4.69e-05, "loss": 0.0001, "num_tokens": 3717798.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 470 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08548094373865699, "frac_reward_zero_std": 0.0, "grad_norm": 0.14052796363830566, "kl": 0.09348682686686516, "learning_rate": 4.7e-05, "loss": 0.0001, "num_tokens": 3723262.0, "reward": 0.25, "reward_std": 0.17320507764816284, "rewards/wrapped_reward_func/mean": 0.25, "rewards/wrapped_reward_func/std": 0.17320507764816284, "step": 471 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08566243194192377, "frac_reward_zero_std": 0.0, "grad_norm": 0.11638838052749634, "kl": 0.1438814327120781, "learning_rate": 4.71e-05, "loss": 0.0001, "num_tokens": 3728726.0, "reward": 0.32499998807907104, "reward_std": 0.15000000596046448, "rewards/wrapped_reward_func/mean": 0.32499998807907104, "rewards/wrapped_reward_func/std": 0.15000000596046448, "step": 472 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08584392014519056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022833351977169514, "kl": 0.08705046400427818, "learning_rate": 4.72e-05, "loss": 0.0001, "num_tokens": 3743402.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 473 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08602540834845734, "frac_reward_zero_std": 1.0, "grad_norm": 0.01584709994494915, "kl": 0.1560915783047676, "learning_rate": 4.73e-05, "loss": 0.0002, "num_tokens": 3758078.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 474 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08620689655172414, "frac_reward_zero_std": 1.0, "grad_norm": 0.00465814583003521, "kl": 0.09656030312180519, "learning_rate": 4.74e-05, "loss": 0.0001, "num_tokens": 3766902.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 475 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08638838475499093, "frac_reward_zero_std": 1.0, "grad_norm": 0.001156590529717505, "kl": 0.09974214434623718, "learning_rate": 4.75e-05, "loss": 0.0001, "num_tokens": 3775726.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 476 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08656987295825772, "frac_reward_zero_std": 1.0, "grad_norm": 0.001268749125301838, "kl": 0.05907566100358963, "learning_rate": 4.76e-05, "loss": 0.0001, "num_tokens": 3783746.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 477 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0867513611615245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015953707043081522, "kl": 0.06348135694861412, "learning_rate": 4.77e-05, "loss": 0.0001, "num_tokens": 3791766.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 478 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08693284936479129, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034455338027328253, "kl": 0.09974687173962593, "learning_rate": 4.78e-05, "loss": 0.0001, "num_tokens": 3795162.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 479 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08711433756805807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031118306796997786, "kl": 0.12577338889241219, "learning_rate": 4.79e-05, "loss": 0.0001, "num_tokens": 3798558.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 480 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08729582577132486, "frac_reward_zero_std": 1.0, "grad_norm": 0.003717184765264392, "kl": 0.11217866092920303, "learning_rate": 4.8e-05, "loss": 0.0001, "num_tokens": 3801934.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 481 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08747731397459166, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027835220098495483, "kl": 0.11266686394810677, "learning_rate": 4.8100000000000004e-05, "loss": 0.0001, "num_tokens": 3805310.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 482 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08765880217785844, "frac_reward_zero_std": 1.0, "grad_norm": 0.003336426569148898, "kl": 0.08117913082242012, "learning_rate": 4.82e-05, "loss": 0.0001, "num_tokens": 3818994.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 483 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08784029038112523, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020799883641302586, "kl": 0.06354839727282524, "learning_rate": 4.83e-05, "loss": 0.0001, "num_tokens": 3832678.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 484 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08802177858439202, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015165829099714756, "kl": 0.07485831901431084, "learning_rate": 4.8400000000000004e-05, "loss": 0.0001, "num_tokens": 3846650.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 485 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0882032667876588, "frac_reward_zero_std": 1.0, "grad_norm": 0.001443956745788455, "kl": 0.07588879764080048, "learning_rate": 4.85e-05, "loss": 0.0001, "num_tokens": 3860622.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 486 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08838475499092559, "frac_reward_zero_std": 1.0, "grad_norm": 0.002107418840751052, "kl": 0.07573697715997696, "learning_rate": 4.86e-05, "loss": 0.0001, "num_tokens": 3866166.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 487 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08856624319419237, "frac_reward_zero_std": 1.0, "grad_norm": 0.005498294718563557, "kl": 0.11692870035767555, "learning_rate": 4.87e-05, "loss": 0.0001, "num_tokens": 3871710.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 488 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08874773139745916, "frac_reward_zero_std": 1.0, "grad_norm": 0.001958824461326003, "kl": 0.09437486529350281, "learning_rate": 4.88e-05, "loss": 0.0001, "num_tokens": 3882934.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 489 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08892921960072596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029206625185906887, "kl": 0.08734860271215439, "learning_rate": 4.89e-05, "loss": 0.0001, "num_tokens": 3894158.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 490 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08911070780399274, "frac_reward_zero_std": 0.0, "grad_norm": 0.0388951450586319, "kl": 0.06985681131482124, "learning_rate": 4.9e-05, "loss": 0.0001, "num_tokens": 3900522.0, "reward": 0.05000000074505806, "reward_std": 0.057735029608011246, "rewards/wrapped_reward_func/mean": 0.05000000074505806, "rewards/wrapped_reward_func/std": 0.057735029608011246, "step": 491 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08929219600725953, "frac_reward_zero_std": 0.0, "grad_norm": 0.027583617717027664, "kl": 0.07229587808251381, "learning_rate": 4.91e-05, "loss": 0.0001, "num_tokens": 3906886.0, "reward": 0.07500000298023224, "reward_std": 0.05000000074505806, "rewards/wrapped_reward_func/mean": 0.07500000298023224, "rewards/wrapped_reward_func/std": 0.05000000074505806, "step": 492 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08947368421052632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017342005157843232, "kl": 0.06251736171543598, "learning_rate": 4.92e-05, "loss": 0.0001, "num_tokens": 3913486.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 493 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0896551724137931, "frac_reward_zero_std": 1.0, "grad_norm": 0.001041323528625071, "kl": 0.0786086954176426, "learning_rate": 4.93e-05, "loss": 0.0001, "num_tokens": 3920086.0, "reward": 0.4000000059604645, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.4000000059604645, "rewards/wrapped_reward_func/std": 0.0, "step": 494 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.08983666061705989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025633852928876877, "kl": 0.06872519105672836, "learning_rate": 4.94e-05, "loss": 0.0001, "num_tokens": 3924298.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 495 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.09001814882032667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012338105589151382, "kl": 0.06796863675117493, "learning_rate": 4.9500000000000004e-05, "loss": 0.0001, "num_tokens": 3928510.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 496 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.09019963702359346, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012765313731506467, "kl": 0.04850449226796627, "learning_rate": 4.96e-05, "loss": 0.0, "num_tokens": 3934642.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 497 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.09038112522686026, "frac_reward_zero_std": 1.0, "grad_norm": 0.003003024496138096, "kl": 0.08406319841742516, "learning_rate": 4.97e-05, "loss": 0.0001, "num_tokens": 3940774.0, "reward": 1.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 1.0, "rewards/wrapped_reward_func/std": 0.0, "step": 498 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.09056261343012705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023521126713603735, "kl": 0.14165138453245163, "learning_rate": 4.9800000000000004e-05, "loss": 0.0001, "num_tokens": 3944362.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 499 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.09074410163339383, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021522147580981255, "kl": 0.09003083780407906, "learning_rate": 4.99e-05, "loss": 0.0001, "num_tokens": 3947950.0, "reward": 0.0, "reward_std": 0.0, "rewards/wrapped_reward_func/mean": 0.0, "rewards/wrapped_reward_func/std": 0.0, "step": 500 } ], "logging_steps": 1, "max_steps": 5000, "num_input_tokens_seen": 3947950, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }