| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.030712530712530713, |
| "eval_steps": 500, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.00015356265356265356, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.0, |
| "num_tokens": 2528.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 1 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0003071253071253071, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22934769093990326, |
| "kl": 0.0, |
| "learning_rate": 5.0000000000000004e-08, |
| "loss": 0.0, |
| "num_tokens": 5056.0, |
| "reward": 0.75, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.75, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 2 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0004606879606879607, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 2.4884398953872733e-05, |
| "kl": 0.0011284813517704606, |
| "learning_rate": 1.0000000000000001e-07, |
| "loss": 0.0, |
| "num_tokens": 7584.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 3 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0006142506142506142, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.449208038044162e-05, |
| "kl": 0.0020245485939085484, |
| "learning_rate": 1.5000000000000002e-07, |
| "loss": 0.0, |
| "num_tokens": 10112.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 4 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0007678132678132678, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28413450717926025, |
| "kl": 0.0017015498597174883, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": 0.0, |
| "num_tokens": 12640.0, |
| "reward": 0.75, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.75, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 5 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0009213759213759214, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.5294800656847656e-05, |
| "kl": 0.001824827864766121, |
| "learning_rate": 2.5000000000000004e-07, |
| "loss": 0.0, |
| "num_tokens": 15168.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 6 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.001074938574938575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22941677272319794, |
| "kl": 0.001545972190797329, |
| "learning_rate": 3.0000000000000004e-07, |
| "loss": 0.0, |
| "num_tokens": 17696.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 7 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0012285012285012285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24631166458129883, |
| "kl": 0.0017354765441268682, |
| "learning_rate": 3.5000000000000004e-07, |
| "loss": 0.0, |
| "num_tokens": 20224.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 8 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.001382063882063882, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.0858907595975325e-05, |
| "kl": 0.0018943097675219178, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 0.0, |
| "num_tokens": 22752.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 9 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0015356265356265355, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.00010698483674786985, |
| "kl": 0.0019578838255256414, |
| "learning_rate": 4.5000000000000003e-07, |
| "loss": 0.0, |
| "num_tokens": 25280.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 10 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0016891891891891893, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.508454028633423e-05, |
| "kl": 0.001726516056805849, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": 0.0, |
| "num_tokens": 27808.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 11 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0018427518427518428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26142945885658264, |
| "kl": 0.001936572720296681, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0, |
| "num_tokens": 30336.0, |
| "reward": 0.5, |
| "reward_std": 0.5345224738121033, |
| "rewards/compute_reward/mean": 0.5, |
| "rewards/compute_reward/std": 0.5345224738121033, |
| "step": 12 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0019963144963144963, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.853518865071237e-05, |
| "kl": 0.001674045342952013, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 0.0, |
| "num_tokens": 32864.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 13 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.00214987714987715, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.5004267778713256e-05, |
| "kl": 0.0016993756871670485, |
| "learning_rate": 6.5e-07, |
| "loss": 0.0, |
| "num_tokens": 35392.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 14 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0023034398034398034, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.000111581408418715, |
| "kl": 0.0021551968529820442, |
| "learning_rate": 7.000000000000001e-07, |
| "loss": 0.0, |
| "num_tokens": 37920.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 15 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.002457002457002457, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2612421214580536, |
| "kl": 0.0017377862241119146, |
| "learning_rate": 7.5e-07, |
| "loss": 0.0, |
| "num_tokens": 40448.0, |
| "reward": 0.25, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.25, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 16 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0026105651105651105, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.1293584217783064e-05, |
| "kl": 0.00191373226698488, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 0.0, |
| "num_tokens": 42976.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 17 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.002764127764127764, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.00013062957441434264, |
| "kl": 0.0017311959527432919, |
| "learning_rate": 8.500000000000001e-07, |
| "loss": 0.0, |
| "num_tokens": 45504.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 18 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0029176904176904175, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.6584205847466365e-05, |
| "kl": 0.001798063050955534, |
| "learning_rate": 9.000000000000001e-07, |
| "loss": 0.0, |
| "num_tokens": 48032.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 19 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.003071253071253071, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.3272721086395904e-05, |
| "kl": 0.0018202185165137053, |
| "learning_rate": 9.500000000000001e-07, |
| "loss": 0.0, |
| "num_tokens": 50560.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 20 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0032248157248157246, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 7.859696779632941e-05, |
| "kl": 0.0018265624530613422, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.0, |
| "num_tokens": 53088.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 21 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0033783783783783786, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26943519711494446, |
| "kl": 0.0018876121612265706, |
| "learning_rate": 1.0500000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 55616.0, |
| "reward": 0.25, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.25, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 22 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.003531941031941032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.203104630112648, |
| "kl": 0.0015638768672943115, |
| "learning_rate": 1.1e-06, |
| "loss": 0.0, |
| "num_tokens": 58144.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 23 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0036855036855036856, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.421776611707173e-05, |
| "kl": 0.0017870652955025434, |
| "learning_rate": 1.1500000000000002e-06, |
| "loss": 0.0, |
| "num_tokens": 60672.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 24 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.003839066339066339, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26979029178619385, |
| "kl": 0.001943537499755621, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 0.0, |
| "num_tokens": 63200.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 25 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.003992628992628993, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27013102173805237, |
| "kl": 0.001535331830382347, |
| "learning_rate": 1.25e-06, |
| "loss": 0.0, |
| "num_tokens": 65728.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 26 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.004146191646191646, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.67719039786607e-05, |
| "kl": 0.0017323686042800546, |
| "learning_rate": 1.3e-06, |
| "loss": 0.0, |
| "num_tokens": 68256.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 27 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0042997542997543, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23535043001174927, |
| "kl": 0.0020425482653081417, |
| "learning_rate": 1.3500000000000002e-06, |
| "loss": 0.0, |
| "num_tokens": 70784.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 28 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.004453316953316954, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.760838080779649e-05, |
| "kl": 0.0016436483711004257, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 73312.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 29 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.004606879606879607, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25959762930870056, |
| "kl": 0.0017603226006031036, |
| "learning_rate": 1.45e-06, |
| "loss": 0.0, |
| "num_tokens": 75840.0, |
| "reward": 0.375, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.375, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 30 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.004760442260442261, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23616784811019897, |
| "kl": 0.0018221420468762517, |
| "learning_rate": 1.5e-06, |
| "loss": 0.0, |
| "num_tokens": 78368.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 31 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.004914004914004914, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.308336792746559e-05, |
| "kl": 0.001862606150098145, |
| "learning_rate": 1.5500000000000002e-06, |
| "loss": 0.0, |
| "num_tokens": 80896.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 32 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.005067567567567568, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.589068339555524e-05, |
| "kl": 0.001723211957141757, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 83424.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 33 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.005221130221130221, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30930790305137634, |
| "kl": 0.001744288601912558, |
| "learning_rate": 1.6500000000000003e-06, |
| "loss": 0.0, |
| "num_tokens": 85952.0, |
| "reward": 0.25, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.25, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 34 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.005374692874692875, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.498694008565508e-05, |
| "kl": 0.001765427179634571, |
| "learning_rate": 1.7000000000000002e-06, |
| "loss": 0.0, |
| "num_tokens": 88480.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 35 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.005528255528255528, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.45219364762306213, |
| "kl": 0.0013709957711398602, |
| "learning_rate": 1.75e-06, |
| "loss": 0.0, |
| "num_tokens": 91008.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 36 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.005681818181818182, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.568007691181265e-05, |
| "kl": 0.0015674353344365954, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 93536.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 37 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.005835380835380835, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.00022698717657476664, |
| "kl": 0.002098341705277562, |
| "learning_rate": 1.85e-06, |
| "loss": 0.0, |
| "num_tokens": 96064.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 38 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.005988943488943489, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.615697616827674e-05, |
| "kl": 0.0018569512758404016, |
| "learning_rate": 1.9000000000000002e-06, |
| "loss": 0.0, |
| "num_tokens": 98592.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 39 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.006142506142506142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3124924898147583, |
| "kl": 0.002205869182944298, |
| "learning_rate": 1.9500000000000004e-06, |
| "loss": 0.0, |
| "num_tokens": 101120.0, |
| "reward": 0.75, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.75, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 40 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.006296068796068796, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.875810554949567e-05, |
| "kl": 0.001907296129502356, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0, |
| "num_tokens": 103648.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 41 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.006449631449631449, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.659927071770653e-05, |
| "kl": 0.0018478649435564876, |
| "learning_rate": 2.05e-06, |
| "loss": 0.0, |
| "num_tokens": 106176.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 42 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.006603194103194103, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 8.290458208648488e-05, |
| "kl": 0.00224427436478436, |
| "learning_rate": 2.1000000000000002e-06, |
| "loss": 0.0, |
| "num_tokens": 108704.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 43 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.006756756756756757, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.3578805818688124e-05, |
| "kl": 0.0018698148196563125, |
| "learning_rate": 2.15e-06, |
| "loss": 0.0, |
| "num_tokens": 111232.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 44 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.00691031941031941, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.0759008697932586e-05, |
| "kl": 0.0018630733247846365, |
| "learning_rate": 2.2e-06, |
| "loss": 0.0, |
| "num_tokens": 113760.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 45 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.007063882063882064, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.1463466914137825e-05, |
| "kl": 0.0014028309378772974, |
| "learning_rate": 2.25e-06, |
| "loss": 0.0, |
| "num_tokens": 116288.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 46 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.007217444717444717, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.521080311154947e-05, |
| "kl": 0.0018875582609325647, |
| "learning_rate": 2.3000000000000004e-06, |
| "loss": 0.0, |
| "num_tokens": 118816.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 47 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.007371007371007371, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27408236265182495, |
| "kl": 0.002105266787111759, |
| "learning_rate": 2.35e-06, |
| "loss": 0.0, |
| "num_tokens": 121344.0, |
| "reward": 0.25, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.25, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 48 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.007524570024570024, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.00027501373551785946, |
| "kl": 0.0023503885604441166, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 0.0, |
| "num_tokens": 123872.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 49 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.007678132678132678, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.1167542804032564e-05, |
| "kl": 0.0015902362065389752, |
| "learning_rate": 2.4500000000000003e-06, |
| "loss": 0.0, |
| "num_tokens": 126400.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 50 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.007831695331695332, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.368236790876836e-05, |
| "kl": 0.0016205956926569343, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0, |
| "num_tokens": 128928.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 51 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.007985257985257985, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2795107364654541, |
| "kl": 0.0019439270254224539, |
| "learning_rate": 2.55e-06, |
| "loss": 0.0, |
| "num_tokens": 131456.0, |
| "reward": 0.5, |
| "reward_std": 0.5345224738121033, |
| "rewards/compute_reward/mean": 0.5, |
| "rewards/compute_reward/std": 0.5345224738121033, |
| "step": 52 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.008138820638820638, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2855827510356903, |
| "kl": 0.0019104867242276669, |
| "learning_rate": 2.6e-06, |
| "loss": 0.0, |
| "num_tokens": 133984.0, |
| "reward": 0.25, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.25, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 53 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.008292383292383292, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.060176747268997e-05, |
| "kl": 0.0017747373785823584, |
| "learning_rate": 2.6500000000000005e-06, |
| "loss": 0.0, |
| "num_tokens": 136512.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 54 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.008445945945945946, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28289034962654114, |
| "kl": 0.0022900549229234457, |
| "learning_rate": 2.7000000000000004e-06, |
| "loss": 0.0, |
| "num_tokens": 139040.0, |
| "reward": 0.625, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.625, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 55 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0085995085995086, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.690838613896631e-05, |
| "kl": 0.001854082802310586, |
| "learning_rate": 2.7500000000000004e-06, |
| "loss": 0.0, |
| "num_tokens": 141568.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 56 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.008753071253071253, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.404248102218844e-05, |
| "kl": 0.0015642520738765597, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 0.0, |
| "num_tokens": 144096.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 57 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.008906633906633907, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2607027590274811, |
| "kl": 0.001625827164389193, |
| "learning_rate": 2.85e-06, |
| "loss": 0.0, |
| "num_tokens": 146624.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 58 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.00906019656019656, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.736987830256112e-05, |
| "kl": 0.0018028158228844404, |
| "learning_rate": 2.9e-06, |
| "loss": 0.0, |
| "num_tokens": 149152.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 59 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.009213759213759214, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2748159170150757, |
| "kl": 0.0017374631715938449, |
| "learning_rate": 2.95e-06, |
| "loss": 0.0, |
| "num_tokens": 151680.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 60 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.009367321867321867, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.3023454054491594e-05, |
| "kl": 0.00184379902202636, |
| "learning_rate": 3e-06, |
| "loss": 0.0, |
| "num_tokens": 154208.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 61 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.009520884520884522, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.636611290858127e-05, |
| "kl": 0.001865661353804171, |
| "learning_rate": 3.05e-06, |
| "loss": 0.0, |
| "num_tokens": 156736.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 62 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.009674447174447175, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.183187411399558e-05, |
| "kl": 0.0015882912557572126, |
| "learning_rate": 3.1000000000000004e-06, |
| "loss": 0.0, |
| "num_tokens": 159264.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 63 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.009828009828009828, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.482505780993961e-05, |
| "kl": 0.0016557248309254646, |
| "learning_rate": 3.1500000000000003e-06, |
| "loss": 0.0, |
| "num_tokens": 161792.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 64 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.00998157248157248, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.539267683867365e-05, |
| "kl": 0.0019870202522724867, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 0.0, |
| "num_tokens": 164320.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 65 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.010135135135135136, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.449189822888002e-05, |
| "kl": 0.0018529919907450676, |
| "learning_rate": 3.2500000000000002e-06, |
| "loss": 0.0, |
| "num_tokens": 166848.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 66 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.010288697788697789, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.32904744148254395, |
| "kl": 0.001953460043296218, |
| "learning_rate": 3.3000000000000006e-06, |
| "loss": 0.0, |
| "num_tokens": 169376.0, |
| "reward": 0.5, |
| "reward_std": 0.5345224738121033, |
| "rewards/compute_reward/mean": 0.5, |
| "rewards/compute_reward/std": 0.5345224738121033, |
| "step": 67 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.010442260442260442, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.281531423330307, |
| "kl": 0.0020269351080060005, |
| "learning_rate": 3.3500000000000005e-06, |
| "loss": 0.0, |
| "num_tokens": 171904.0, |
| "reward": 0.5, |
| "reward_std": 0.5345224738121033, |
| "rewards/compute_reward/mean": 0.5, |
| "rewards/compute_reward/std": 0.5345224738121033, |
| "step": 68 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.010595823095823095, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.164610774954781e-05, |
| "kl": 0.0018625680822879076, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": 0.0, |
| "num_tokens": 174432.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 69 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01074938574938575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2751414179801941, |
| "kl": 0.0016131119336932898, |
| "learning_rate": 3.45e-06, |
| "loss": 0.0, |
| "num_tokens": 176960.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 70 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.010902948402948403, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.301176518201828, |
| "kl": 0.0018663486698642373, |
| "learning_rate": 3.5e-06, |
| "loss": 0.0, |
| "num_tokens": 179488.0, |
| "reward": 0.625, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.625, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 71 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.011056511056511056, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.354660995886661e-05, |
| "kl": 0.0017401627264916897, |
| "learning_rate": 3.5500000000000003e-06, |
| "loss": 0.0, |
| "num_tokens": 182016.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 72 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01121007371007371, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.4372500017052516e-05, |
| "kl": 0.0021832087077200413, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": 0.0, |
| "num_tokens": 184544.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 73 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.011363636363636364, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.5604072763817385e-05, |
| "kl": 0.0017031468451023102, |
| "learning_rate": 3.65e-06, |
| "loss": 0.0, |
| "num_tokens": 187072.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 74 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.011517199017199017, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.006764743477106094, |
| "kl": 0.01521103922277689, |
| "learning_rate": 3.7e-06, |
| "loss": 0.0, |
| "num_tokens": 189600.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 75 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01167076167076167, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.919710798072629e-05, |
| "kl": 0.001588721526786685, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.0, |
| "num_tokens": 192128.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 76 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.011824324324324325, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.283232718706131, |
| "kl": 0.0022833647672086954, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 0.0, |
| "num_tokens": 194656.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 77 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.011977886977886978, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 8.2063750596717e-05, |
| "kl": 0.0021289209835231304, |
| "learning_rate": 3.85e-06, |
| "loss": 0.0, |
| "num_tokens": 197184.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 78 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.012131449631449631, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23820413649082184, |
| "kl": 0.0019389790249988437, |
| "learning_rate": 3.900000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 199712.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 79 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.012285012285012284, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.6227207601768896e-05, |
| "kl": 0.0018583569908514619, |
| "learning_rate": 3.95e-06, |
| "loss": 0.0, |
| "num_tokens": 202240.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 80 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.012438574938574939, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.6503493902273476e-05, |
| "kl": 0.0019133782479912043, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 204768.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 81 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.012592137592137592, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.445562055683695e-05, |
| "kl": 0.001825973391532898, |
| "learning_rate": 4.05e-06, |
| "loss": 0.0, |
| "num_tokens": 207296.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 82 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.012745700245700245, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.9896567361429334e-05, |
| "kl": 0.0016772599192336202, |
| "learning_rate": 4.1e-06, |
| "loss": 0.0, |
| "num_tokens": 209824.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 83 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.012899262899262898, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26036882400512695, |
| "kl": 0.0018029706552624702, |
| "learning_rate": 4.15e-06, |
| "loss": 0.0, |
| "num_tokens": 212352.0, |
| "reward": 0.5, |
| "reward_std": 0.5345224738121033, |
| "rewards/compute_reward/mean": 0.5, |
| "rewards/compute_reward/std": 0.5345224738121033, |
| "step": 84 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.013052825552825553, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.36849838681519e-05, |
| "kl": 0.0016535220202058554, |
| "learning_rate": 4.2000000000000004e-06, |
| "loss": 0.0, |
| "num_tokens": 214880.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 85 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.013206388206388206, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.508507092599757e-05, |
| "kl": 0.0019267231691628695, |
| "learning_rate": 4.25e-06, |
| "loss": 0.0, |
| "num_tokens": 217408.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 86 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01335995085995086, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.793669384322129e-05, |
| "kl": 0.0014971359632909298, |
| "learning_rate": 4.3e-06, |
| "loss": 0.0, |
| "num_tokens": 219936.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 87 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.013513513513513514, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27244484424591064, |
| "kl": 0.0018051433144137263, |
| "learning_rate": 4.350000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 222464.0, |
| "reward": 0.25, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.25, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 88 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.013667076167076167, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.993747668573633e-05, |
| "kl": 0.0018220169004052877, |
| "learning_rate": 4.4e-06, |
| "loss": 0.0, |
| "num_tokens": 224992.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 89 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01382063882063882, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 8.624274050816894e-05, |
| "kl": 0.0022467547096312046, |
| "learning_rate": 4.450000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 227520.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 90 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.013974201474201474, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.577819709083997e-05, |
| "kl": 0.002002157736569643, |
| "learning_rate": 4.5e-06, |
| "loss": 0.0, |
| "num_tokens": 230048.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 91 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.014127764127764128, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.1842817154247314e-05, |
| "kl": 0.0017484177369624376, |
| "learning_rate": 4.5500000000000005e-06, |
| "loss": 0.0, |
| "num_tokens": 232576.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 92 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.014281326781326781, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2512575685977936, |
| "kl": 0.0015832895878702402, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 235104.0, |
| "reward": 0.625, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.625, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 93 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.014434889434889435, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 7.139512308640406e-05, |
| "kl": 0.0019443321507424116, |
| "learning_rate": 4.65e-06, |
| "loss": 0.0, |
| "num_tokens": 237632.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 94 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.014588452088452088, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.431122579262592e-05, |
| "kl": 0.0016710343770682812, |
| "learning_rate": 4.7e-06, |
| "loss": 0.0, |
| "num_tokens": 240160.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 95 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.014742014742014743, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.648687900044024e-05, |
| "kl": 0.0017124009318649769, |
| "learning_rate": 4.75e-06, |
| "loss": 0.0, |
| "num_tokens": 242688.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 96 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.014895577395577396, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.7402332458877936e-05, |
| "kl": 0.0018351635662838817, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 245216.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 97 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.015049140049140049, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2951975166797638, |
| "kl": 0.001936566550284624, |
| "learning_rate": 4.85e-06, |
| "loss": 0.0, |
| "num_tokens": 247744.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 98 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.015202702702702704, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.6721663491334766e-05, |
| "kl": 0.0014798862393945456, |
| "learning_rate": 4.9000000000000005e-06, |
| "loss": 0.0, |
| "num_tokens": 250272.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 99 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.015356265356265357, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2804346978664398, |
| "kl": 0.0018570123938843608, |
| "learning_rate": 4.95e-06, |
| "loss": 0.0, |
| "num_tokens": 252800.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 100 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01550982800982801, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.16461550432723e-05, |
| "kl": 0.0020596638787537813, |
| "learning_rate": 5e-06, |
| "loss": 0.0, |
| "num_tokens": 255328.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 101 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.015663390663390665, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3104115426540375, |
| "kl": 0.0018204101361334324, |
| "learning_rate": 4.999984769144476e-06, |
| "loss": 0.0, |
| "num_tokens": 257856.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 102 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.015816953316953316, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24072949588298798, |
| "kl": 0.0018243859522044659, |
| "learning_rate": 4.999939076763487e-06, |
| "loss": 0.0, |
| "num_tokens": 260384.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 103 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01597051597051597, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24348726868629456, |
| "kl": 0.0016441608313471079, |
| "learning_rate": 4.999862923413781e-06, |
| "loss": 0.0, |
| "num_tokens": 262912.0, |
| "reward": 0.375, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.375, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 104 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.016124078624078626, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.530456499196589e-05, |
| "kl": 0.0015811467310413718, |
| "learning_rate": 4.999756310023261e-06, |
| "loss": 0.0, |
| "num_tokens": 265440.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 105 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.016277641277641277, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.9636288420297205e-05, |
| "kl": 0.001855593640357256, |
| "learning_rate": 4.9996192378909785e-06, |
| "loss": 0.0, |
| "num_tokens": 267968.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 106 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.016431203931203932, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.759299867553636e-05, |
| "kl": 0.002279892098158598, |
| "learning_rate": 4.999451708687114e-06, |
| "loss": 0.0, |
| "num_tokens": 270496.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 107 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.016584766584766583, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 8.02454596851021e-05, |
| "kl": 0.0017966879531741142, |
| "learning_rate": 4.9992537244529585e-06, |
| "loss": 0.0, |
| "num_tokens": 273024.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 108 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.016738329238329238, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 7.645921141374856e-05, |
| "kl": 0.0018427056493237615, |
| "learning_rate": 4.999025287600886e-06, |
| "loss": 0.0, |
| "num_tokens": 275552.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 109 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.016891891891891893, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26137876510620117, |
| "kl": 0.0018832421628758311, |
| "learning_rate": 4.998766400914329e-06, |
| "loss": 0.0, |
| "num_tokens": 278080.0, |
| "reward": 0.25, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.25, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 110 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.017045454545454544, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.3144853407284245e-05, |
| "kl": 0.0019343274179846048, |
| "learning_rate": 4.99847706754774e-06, |
| "loss": 0.0, |
| "num_tokens": 280608.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 111 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0171990171990172, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.296404629945755, |
| "kl": 0.0017289104871451855, |
| "learning_rate": 4.998157291026553e-06, |
| "loss": 0.0, |
| "num_tokens": 283136.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 112 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.017352579852579854, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.901392043801025e-05, |
| "kl": 0.001817359123378992, |
| "learning_rate": 4.997807075247147e-06, |
| "loss": 0.0, |
| "num_tokens": 285664.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 113 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.017506142506142505, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 9.294539631810039e-05, |
| "kl": 0.0021493504755198956, |
| "learning_rate": 4.997426424476787e-06, |
| "loss": 0.0, |
| "num_tokens": 288192.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 114 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01765970515970516, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3077925741672516, |
| "kl": 0.0031500193290412426, |
| "learning_rate": 4.9970153433535855e-06, |
| "loss": 0.0, |
| "num_tokens": 290720.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 115 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.017813267813267815, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 7.058670598780736e-05, |
| "kl": 0.001972172874957323, |
| "learning_rate": 4.9965738368864345e-06, |
| "loss": 0.0, |
| "num_tokens": 293248.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 116 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.017966830466830466, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.9960759750101715e-05, |
| "kl": 0.0017217604909092188, |
| "learning_rate": 4.996101910454953e-06, |
| "loss": 0.0, |
| "num_tokens": 295776.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 117 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01812039312039312, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.5281241909833625e-05, |
| "kl": 0.002011762699112296, |
| "learning_rate": 4.995599569809414e-06, |
| "loss": 0.0, |
| "num_tokens": 298304.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 118 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.018273955773955772, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.574916212935932e-05, |
| "kl": 0.001746984082274139, |
| "learning_rate": 4.9950668210706795e-06, |
| "loss": 0.0, |
| "num_tokens": 300832.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 119 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.018427518427518427, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.778512422693893e-05, |
| "kl": 0.0020345754455775023, |
| "learning_rate": 4.994503670730126e-06, |
| "loss": 0.0, |
| "num_tokens": 303360.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 120 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.018581081081081082, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.11362120253034e-05, |
| "kl": 0.0017627079505473375, |
| "learning_rate": 4.993910125649561e-06, |
| "loss": 0.0, |
| "num_tokens": 305888.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 121 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.018734643734643733, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2692274749279022, |
| "kl": 0.00172503013163805, |
| "learning_rate": 4.993286193061145e-06, |
| "loss": 0.0, |
| "num_tokens": 308416.0, |
| "reward": 0.375, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.375, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 122 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01888820638820639, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26355016231536865, |
| "kl": 0.0016964760143309832, |
| "learning_rate": 4.992631880567301e-06, |
| "loss": 0.0, |
| "num_tokens": 310944.0, |
| "reward": 0.5, |
| "reward_std": 0.5345224738121033, |
| "rewards/compute_reward/mean": 0.5, |
| "rewards/compute_reward/std": 0.5345224738121033, |
| "step": 123 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.019041769041769043, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2689116895198822, |
| "kl": 0.001957943197339773, |
| "learning_rate": 4.991947196140619e-06, |
| "loss": 0.0, |
| "num_tokens": 313472.0, |
| "reward": 0.375, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.375, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 124 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.019195331695331695, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.229343980550766, |
| "kl": 0.0017839998472481966, |
| "learning_rate": 4.9912321481237616e-06, |
| "loss": 0.0, |
| "num_tokens": 316000.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 125 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01934889434889435, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.424494495149702e-05, |
| "kl": 0.001683582435362041, |
| "learning_rate": 4.990486745229364e-06, |
| "loss": 0.0, |
| "num_tokens": 318528.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 126 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.019502457002457004, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.912633812637068e-05, |
| "kl": 0.001943158800713718, |
| "learning_rate": 4.989710996539926e-06, |
| "loss": 0.0, |
| "num_tokens": 321056.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 127 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.019656019656019656, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31882649660110474, |
| "kl": 0.0018522620666772127, |
| "learning_rate": 4.9889049115077e-06, |
| "loss": 0.0, |
| "num_tokens": 323584.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 128 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01980958230958231, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29109644889831543, |
| "kl": 0.0019171757157891989, |
| "learning_rate": 4.988068499954578e-06, |
| "loss": 0.0, |
| "num_tokens": 326112.0, |
| "reward": 0.25, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.25, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 129 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.01996314496314496, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.321538239717484e-05, |
| "kl": 0.00218316912651062, |
| "learning_rate": 4.987201772071971e-06, |
| "loss": 0.0, |
| "num_tokens": 328640.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 130 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.020116707616707617, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.9103619201341644e-05, |
| "kl": 0.0019879359751939774, |
| "learning_rate": 4.986304738420684e-06, |
| "loss": 0.0, |
| "num_tokens": 331168.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 131 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02027027027027027, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2476845532655716, |
| "kl": 0.0018740994855761528, |
| "learning_rate": 4.985377409930789e-06, |
| "loss": 0.0, |
| "num_tokens": 333696.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 132 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.020423832923832923, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2667451798915863, |
| "kl": 0.0017065267311409116, |
| "learning_rate": 4.984419797901491e-06, |
| "loss": 0.0, |
| "num_tokens": 336224.0, |
| "reward": 0.5, |
| "reward_std": 0.5345224738121033, |
| "rewards/compute_reward/mean": 0.5, |
| "rewards/compute_reward/std": 0.5345224738121033, |
| "step": 133 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.020577395577395578, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26921480894088745, |
| "kl": 0.0016423269407823682, |
| "learning_rate": 4.983431914000991e-06, |
| "loss": 0.0, |
| "num_tokens": 338752.0, |
| "reward": 0.5, |
| "reward_std": 0.5345224738121033, |
| "rewards/compute_reward/mean": 0.5, |
| "rewards/compute_reward/std": 0.5345224738121033, |
| "step": 134 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.020730958230958232, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28913527727127075, |
| "kl": 0.0017424484249204397, |
| "learning_rate": 4.9824137702663424e-06, |
| "loss": 0.0, |
| "num_tokens": 341280.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 135 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.020884520884520884, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2528924345970154, |
| "kl": 0.002046446083113551, |
| "learning_rate": 4.981365379103306e-06, |
| "loss": 0.0, |
| "num_tokens": 343808.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 136 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02103808353808354, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.9009151553036645e-05, |
| "kl": 0.0018441956490278244, |
| "learning_rate": 4.980286753286196e-06, |
| "loss": 0.0, |
| "num_tokens": 346336.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 137 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02119164619164619, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.832565198536031e-05, |
| "kl": 0.001864951686002314, |
| "learning_rate": 4.979177905957726e-06, |
| "loss": 0.0, |
| "num_tokens": 348864.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 138 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.021345208845208845, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2377214878797531, |
| "kl": 0.0015864957822486758, |
| "learning_rate": 4.978038850628855e-06, |
| "loss": 0.0, |
| "num_tokens": 351392.0, |
| "reward": 0.625, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.625, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 139 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0214987714987715, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.7819783503655344e-05, |
| "kl": 0.0015667935367673635, |
| "learning_rate": 4.9768696011786095e-06, |
| "loss": 0.0, |
| "num_tokens": 353920.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 140 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02165233415233415, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.9188554839929566e-05, |
| "kl": 0.0015942950267344713, |
| "learning_rate": 4.975670171853926e-06, |
| "loss": 0.0, |
| "num_tokens": 356448.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 141 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.021805896805896806, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.5208802475826815e-05, |
| "kl": 0.0018722546519711614, |
| "learning_rate": 4.974440577269473e-06, |
| "loss": 0.0, |
| "num_tokens": 358976.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 142 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02195945945945946, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.318508920026943e-05, |
| "kl": 0.0016882454510778189, |
| "learning_rate": 4.973180832407471e-06, |
| "loss": 0.0, |
| "num_tokens": 361504.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 143 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.022113022113022112, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24421143531799316, |
| "kl": 0.0019481799099594355, |
| "learning_rate": 4.971890952617515e-06, |
| "loss": 0.0, |
| "num_tokens": 364032.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 144 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.022266584766584767, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 8.330932905664667e-05, |
| "kl": 0.0020749252289533615, |
| "learning_rate": 4.970570953616383e-06, |
| "loss": 0.0, |
| "num_tokens": 366560.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 145 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02242014742014742, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23043030500411987, |
| "kl": 0.0018000759882852435, |
| "learning_rate": 4.9692208514878445e-06, |
| "loss": 0.0, |
| "num_tokens": 369088.0, |
| "reward": 0.75, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.75, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 146 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.022573710073710073, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.8827070713741705e-05, |
| "kl": 0.0021186263766139746, |
| "learning_rate": 4.96784066268247e-06, |
| "loss": 0.0, |
| "num_tokens": 371616.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 147 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.022727272727272728, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.167725950945169e-05, |
| "kl": 0.0021395478397607803, |
| "learning_rate": 4.966430404017424e-06, |
| "loss": 0.0, |
| "num_tokens": 374144.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 148 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02288083538083538, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.823269748361781e-05, |
| "kl": 0.0019843820482492447, |
| "learning_rate": 4.964990092676263e-06, |
| "loss": 0.0, |
| "num_tokens": 376672.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 149 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.023034398034398034, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 8.367564441869035e-05, |
| "kl": 0.0018627950921654701, |
| "learning_rate": 4.963519746208726e-06, |
| "loss": 0.0, |
| "num_tokens": 379200.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 150 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02318796068796069, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.863860471639782e-05, |
| "kl": 0.0017451518215239048, |
| "learning_rate": 4.962019382530521e-06, |
| "loss": 0.0, |
| "num_tokens": 381728.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 151 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02334152334152334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25255465507507324, |
| "kl": 0.0014641667949035764, |
| "learning_rate": 4.960489019923105e-06, |
| "loss": 0.0, |
| "num_tokens": 384256.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 152 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.023495085995085995, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33457645773887634, |
| "kl": 0.0019125788239762187, |
| "learning_rate": 4.958928677033465e-06, |
| "loss": 0.0, |
| "num_tokens": 386784.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 153 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02364864864864865, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.346098987502046e-05, |
| "kl": 0.0018906190525740385, |
| "learning_rate": 4.957338372873886e-06, |
| "loss": 0.0, |
| "num_tokens": 389312.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 154 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.0238022113022113, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.772399577428587e-05, |
| "kl": 0.0018508832436054945, |
| "learning_rate": 4.9557181268217225e-06, |
| "loss": 0.0, |
| "num_tokens": 391840.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 155 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.023955773955773956, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25293248891830444, |
| "kl": 0.0018264721147716045, |
| "learning_rate": 4.9540679586191605e-06, |
| "loss": 0.0, |
| "num_tokens": 394368.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 156 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02410933660933661, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.8626916143111885e-05, |
| "kl": 0.0015982667682692409, |
| "learning_rate": 4.9523878883729794e-06, |
| "loss": 0.0, |
| "num_tokens": 396896.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 157 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.024262899262899262, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 7.256789831444621e-05, |
| "kl": 0.001972634345293045, |
| "learning_rate": 4.9506779365543054e-06, |
| "loss": 0.0, |
| "num_tokens": 399424.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 158 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.024416461916461917, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.3696527427528054e-05, |
| "kl": 0.0015835874946787953, |
| "learning_rate": 4.94893812399836e-06, |
| "loss": 0.0, |
| "num_tokens": 401952.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 159 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02457002457002457, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.538014763966203e-05, |
| "kl": 0.0016496441094204783, |
| "learning_rate": 4.947168471904213e-06, |
| "loss": 0.0, |
| "num_tokens": 404480.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 160 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.024723587223587223, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2756509482860565, |
| "kl": 0.0019514348823577166, |
| "learning_rate": 4.9453690018345144e-06, |
| "loss": 0.0, |
| "num_tokens": 407008.0, |
| "reward": 0.375, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.375, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 161 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.024877149877149878, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.101573697174899e-05, |
| "kl": 0.0015867715701460838, |
| "learning_rate": 4.9435397357152406e-06, |
| "loss": 0.0, |
| "num_tokens": 409536.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 162 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02503071253071253, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.291233926778659e-05, |
| "kl": 0.0017499066889286041, |
| "learning_rate": 4.9416806958354206e-06, |
| "loss": 0.0, |
| "num_tokens": 412064.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 163 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.025184275184275184, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30920252203941345, |
| "kl": 0.0018755393102765083, |
| "learning_rate": 4.939791904846869e-06, |
| "loss": 0.0, |
| "num_tokens": 414592.0, |
| "reward": 0.75, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.75, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 164 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02533783783783784, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3231056034564972, |
| "kl": 0.0015844641020521522, |
| "learning_rate": 4.937873385763909e-06, |
| "loss": 0.0, |
| "num_tokens": 417120.0, |
| "reward": 0.375, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.375, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 165 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02549140049140049, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.7320310336072e-05, |
| "kl": 0.0019866579677909613, |
| "learning_rate": 4.935925161963089e-06, |
| "loss": 0.0, |
| "num_tokens": 419648.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 166 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.025644963144963145, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.170530832605436e-05, |
| "kl": 0.0020753752905875444, |
| "learning_rate": 4.933947257182901e-06, |
| "loss": 0.0, |
| "num_tokens": 422176.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 167 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.025798525798525797, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 8.350784628419206e-05, |
| "kl": 0.0021149544045329094, |
| "learning_rate": 4.9319396955234925e-06, |
| "loss": 0.0, |
| "num_tokens": 424704.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 168 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02595208845208845, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 7.291053043445572e-05, |
| "kl": 0.0020025530830025673, |
| "learning_rate": 4.9299025014463665e-06, |
| "loss": 0.0, |
| "num_tokens": 427232.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 169 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.026105651105651106, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.736949995276518e-05, |
| "kl": 0.0020428327843546867, |
| "learning_rate": 4.92783569977409e-06, |
| "loss": 0.0, |
| "num_tokens": 429760.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 170 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.026259213759213758, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0001171924959635362, |
| "kl": 0.0021608027163892984, |
| "learning_rate": 4.925739315689991e-06, |
| "loss": 0.0, |
| "num_tokens": 432288.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 171 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.026412776412776413, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.31147199869155884, |
| "kl": 0.002245452953502536, |
| "learning_rate": 4.923613374737848e-06, |
| "loss": 0.0, |
| "num_tokens": 434816.0, |
| "reward": 0.625, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.625, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 172 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.026566339066339067, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 3.6620618629967794e-05, |
| "kl": 0.0016759500140324235, |
| "learning_rate": 4.921457902821578e-06, |
| "loss": 0.0, |
| "num_tokens": 437344.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 173 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02671990171990172, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.209245475474745e-05, |
| "kl": 0.001675160019658506, |
| "learning_rate": 4.9192729262049285e-06, |
| "loss": 0.0, |
| "num_tokens": 439872.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 174 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.026873464373464374, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2695581316947937, |
| "kl": 0.0020962983835488558, |
| "learning_rate": 4.917058471511149e-06, |
| "loss": 0.0, |
| "num_tokens": 442400.0, |
| "reward": 0.5, |
| "reward_std": 0.5345224738121033, |
| "rewards/compute_reward/mean": 0.5, |
| "rewards/compute_reward/std": 0.5345224738121033, |
| "step": 175 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02702702702702703, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30832597613334656, |
| "kl": 0.0018510306254029274, |
| "learning_rate": 4.914814565722671e-06, |
| "loss": 0.0, |
| "num_tokens": 444928.0, |
| "reward": 0.25, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.25, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 176 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02718058968058968, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2357330024242401, |
| "kl": 0.0017001696396619081, |
| "learning_rate": 4.912541236180779e-06, |
| "loss": 0.0, |
| "num_tokens": 447456.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 177 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.027334152334152335, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.062870306777768e-05, |
| "kl": 0.0015750662423670292, |
| "learning_rate": 4.910238510585275e-06, |
| "loss": 0.0, |
| "num_tokens": 449984.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 178 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.027487714987714986, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.00010732597002061084, |
| "kl": 0.0025620560627430677, |
| "learning_rate": 4.907906416994146e-06, |
| "loss": 0.0, |
| "num_tokens": 452512.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 179 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02764127764127764, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33509257435798645, |
| "kl": 0.0022440755274146795, |
| "learning_rate": 4.905544983823214e-06, |
| "loss": 0.0, |
| "num_tokens": 455040.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 180 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.027794840294840296, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.418764678528532e-05, |
| "kl": 0.0018326432909816504, |
| "learning_rate": 4.903154239845798e-06, |
| "loss": 0.0, |
| "num_tokens": 457568.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 181 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.027948402948402947, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.49908251943998e-05, |
| "kl": 0.0018245319370180368, |
| "learning_rate": 4.900734214192358e-06, |
| "loss": 0.0, |
| "num_tokens": 460096.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 182 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.028101965601965602, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 7.694274972891435e-05, |
| "kl": 0.002267412142828107, |
| "learning_rate": 4.898284936350144e-06, |
| "loss": 0.0, |
| "num_tokens": 462624.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 183 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.028255528255528257, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 7.338741852436215e-05, |
| "kl": 0.002028039190918207, |
| "learning_rate": 4.8958064361628334e-06, |
| "loss": 0.0, |
| "num_tokens": 465152.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 184 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.028409090909090908, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25913721323013306, |
| "kl": 0.002066713524982333, |
| "learning_rate": 4.893298743830168e-06, |
| "loss": 0.0, |
| "num_tokens": 467680.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 185 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.028562653562653563, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27374428510665894, |
| "kl": 0.0019455322762951255, |
| "learning_rate": 4.890761889907589e-06, |
| "loss": 0.0, |
| "num_tokens": 470208.0, |
| "reward": 0.75, |
| "reward_std": 0.4629100561141968, |
| "rewards/compute_reward/mean": 0.75, |
| "rewards/compute_reward/std": 0.4629100561141968, |
| "step": 186 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.028716216216216218, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.599364885711111e-05, |
| "kl": 0.001901011448353529, |
| "learning_rate": 4.888195905305859e-06, |
| "loss": 0.0, |
| "num_tokens": 472736.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 187 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02886977886977887, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30285513401031494, |
| "kl": 0.0020216817501932383, |
| "learning_rate": 4.885600821290692e-06, |
| "loss": 0.0, |
| "num_tokens": 475264.0, |
| "reward": 0.375, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.375, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 188 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.029023341523341524, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 4.9795729864854366e-05, |
| "kl": 0.002074337564408779, |
| "learning_rate": 4.882976669482368e-06, |
| "loss": 0.0, |
| "num_tokens": 477792.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 189 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.029176904176904175, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2235657274723053, |
| "kl": 0.002039581071585417, |
| "learning_rate": 4.880323481855347e-06, |
| "loss": 0.0, |
| "num_tokens": 480320.0, |
| "reward": 0.125, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.125, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 190 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02933046683046683, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.30517762899398804, |
| "kl": 0.0021754128392785788, |
| "learning_rate": 4.8776412907378845e-06, |
| "loss": 0.0, |
| "num_tokens": 482848.0, |
| "reward": 0.875, |
| "reward_std": 0.3535533845424652, |
| "rewards/compute_reward/mean": 0.875, |
| "rewards/compute_reward/std": 0.3535533845424652, |
| "step": 191 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.029484029484029485, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 7.001228368608281e-05, |
| "kl": 0.001999365398660302, |
| "learning_rate": 4.874930128811631e-06, |
| "loss": 0.0, |
| "num_tokens": 485376.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 192 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.029637592137592136, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 7.662906136829406e-05, |
| "kl": 0.002293789992108941, |
| "learning_rate": 4.8721900291112415e-06, |
| "loss": 0.0, |
| "num_tokens": 487904.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 193 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.02979115479115479, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.919369141338393e-05, |
| "kl": 0.001975182443857193, |
| "learning_rate": 4.869421025023965e-06, |
| "loss": 0.0, |
| "num_tokens": 490432.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 194 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.029944717444717446, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 5.6734315876383334e-05, |
| "kl": 0.0021232955623418093, |
| "learning_rate": 4.866623150289241e-06, |
| "loss": 0.0, |
| "num_tokens": 492960.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 195 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.030098280098280097, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.38264391454868e-05, |
| "kl": 0.0018839393742382526, |
| "learning_rate": 4.863796438998293e-06, |
| "loss": 0.0, |
| "num_tokens": 495488.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 1.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 196 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.030251842751842752, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 6.795779336243868e-05, |
| "kl": 0.0018915513064712286, |
| "learning_rate": 4.860940925593703e-06, |
| "loss": 0.0, |
| "num_tokens": 498016.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 197 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.030405405405405407, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2879483699798584, |
| "kl": 0.0019047476816922426, |
| "learning_rate": 4.858056644869002e-06, |
| "loss": 0.0, |
| "num_tokens": 500544.0, |
| "reward": 0.625, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.625, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 198 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.03055896805896806, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 7.852609996916726e-05, |
| "kl": 0.002171017462387681, |
| "learning_rate": 4.855143631968242e-06, |
| "loss": 0.0, |
| "num_tokens": 503072.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/compute_reward/mean": 0.0, |
| "rewards/compute_reward/std": 0.0, |
| "step": 199 |
| }, |
| { |
| "completion_length": 256.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 256.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 256.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 256.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.030712530712530713, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2760181128978729, |
| "kl": 0.0020151452627032995, |
| "learning_rate": 4.852201922385564e-06, |
| "loss": 0.0, |
| "num_tokens": 505600.0, |
| "reward": 0.625, |
| "reward_std": 0.5175491571426392, |
| "rewards/compute_reward/mean": 0.625, |
| "rewards/compute_reward/std": 0.5175492167472839, |
| "step": 200 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1000, |
| "num_input_tokens_seen": 505600, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|