diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15217 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.999111111111111, + "eval_steps": 500, + "global_step": 562, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 1020.078125, + "completions/mean_terminated_length": 773.0, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "epoch": 0.0035555555555555557, + "grad_norm": 0.2771470571706861, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0, + "num_tokens": 149594.0, + "reward": 0.0078125, + "reward_std": 0.009021097794175148, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.0071111111111111115, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2.941176470588235e-08, + "loss": 0.0, + "num_tokens": 299738.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 1014.9609375, + "completions/mean_terminated_length": 445.5, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.010666666666666666, + "grad_norm": 0.3910424013646084, + "kl": 0.0012340545654296875, + "learning_rate": 5.88235294117647e-08, + "loss": 0.0, + "num_tokens": 448737.0, + "reward": 0.01171875, + "reward_std": 0.016833597794175148, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 1015.1328125, + "completions/mean_terminated_length": 456.5, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.014222222222222223, + "grad_norm": 0.7477560337297564, + "kl": 0.0012416839599609375, + "learning_rate": 8.823529411764706e-08, + "loss": 0.0, + "num_tokens": 597734.0, + "reward": 0.0078125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.017777777777777778, + "grad_norm": 0.004210061181011468, + "kl": 0.0011844635009765625, + "learning_rate": 1.176470588235294e-07, + "loss": 0.0, + "num_tokens": 747806.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.021333333333333333, + "grad_norm": 0.30463059980407714, + "kl": 0.0012011528015136719, + "learning_rate": 1.4705882352941175e-07, + "loss": 0.0, + "num_tokens": 897894.0, + "reward": 0.00390625, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0078125, + "rewards/equation_reward_func/std": 0.0883883461356163, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.024888888888888887, + "grad_norm": 0.9478569692121466, + "kl": 0.0013608932495117188, + "learning_rate": 1.764705882352941e-07, + "loss": 0.0, + "num_tokens": 1048030.0, + "reward": 0.00390625, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0078125, + "rewards/equation_reward_func/std": 0.0883883461356163, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 1023.3125, + "completions/mean_terminated_length": 936.0, + "completions/min_length": 936.0, + "completions/min_terminated_length": 936.0, + "epoch": 0.028444444444444446, + "grad_norm": 0.0033097224749988386, + "kl": 0.0013580322265625, + "learning_rate": 2.0588235294117645e-07, + "loss": 0.0, + "num_tokens": 1198082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.032, + "grad_norm": 0.3396430847864516, + "kl": 0.0011739730834960938, + "learning_rate": 2.352941176470588e-07, + "loss": 0.0, + "num_tokens": 1348102.0, + "reward": 0.0078125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.035555555555555556, + "grad_norm": 0.0019085271667908846, + "kl": 0.0013241767883300781, + "learning_rate": 2.6470588235294114e-07, + "loss": 0.0, + "num_tokens": 1498154.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.03911111111111111, + "grad_norm": 0.12196802997866993, + "kl": 0.003757476806640625, + "learning_rate": 2.941176470588235e-07, + "loss": 0.0, + "num_tokens": 1648294.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 1022.0859375, + "completions/mean_terminated_length": 779.0, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.353412956717465, + "kl": 0.0011844635009765625, + "learning_rate": 3.2352941176470586e-07, + "loss": 0.0, + "num_tokens": 1798157.0, + "reward": 0.0078125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9765625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 1015.5, + "completions/mean_terminated_length": 661.3333740234375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.04622222222222222, + "grad_norm": 0.4177192368926344, + "kl": 0.0013208389282226562, + "learning_rate": 3.529411764705882e-07, + "loss": 0.0019, + "num_tokens": 1947113.0, + "reward": 0.0078125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 1019.8984375, + "completions/mean_terminated_length": 499.0, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "epoch": 0.049777777777777775, + "grad_norm": 1.030360015389124, + "kl": 0.0012826919555664062, + "learning_rate": 3.8235294117647053e-07, + "loss": 0.0, + "num_tokens": 2096668.0, + "reward": 0.0078125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0078125, + "rewards/equation_reward_func/std": 0.0883883461356163, + "rewards/format_reward_func/mean": 0.0078125, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.05333333333333334, + "grad_norm": 0.006130423266424807, + "kl": 0.001644134521484375, + "learning_rate": 4.117647058823529e-07, + "loss": 0.0, + "num_tokens": 2246772.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.05688888888888889, + "grad_norm": 0.47084182350703124, + "kl": 0.0013189315795898438, + "learning_rate": 4.4117647058823526e-07, + "loss": 0.0, + "num_tokens": 2396868.0, + "reward": 0.0078125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.060444444444444446, + "grad_norm": 0.39485183865726337, + "kl": 0.0012559890747070312, + "learning_rate": 4.705882352941176e-07, + "loss": 0.0, + "num_tokens": 2546980.0, + "reward": 0.0078125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 1019.0, + "completions/mean_terminated_length": 384.0, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.064, + "grad_norm": 0.5277276749582551, + "kl": 0.0015764236450195312, + "learning_rate": 5e-07, + "loss": 0.0, + "num_tokens": 2696456.0, + "reward": 0.01171875, + "reward_std": 0.016833597794175148, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 1023.203125, + "completions/mean_terminated_length": 922.0, + "completions/min_length": 922.0, + "completions/min_terminated_length": 922.0, + "epoch": 0.06755555555555555, + "grad_norm": 0.6327836514564609, + "kl": 0.0015048980712890625, + "learning_rate": 4.999958464872182e-07, + "loss": 0.0, + "num_tokens": 2846446.0, + "reward": 0.01171875, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 1023.609375, + "completions/mean_terminated_length": 974.0, + "completions/min_length": 974.0, + "completions/min_terminated_length": 974.0, + "epoch": 0.07111111111111111, + "grad_norm": 0.3191632805099472, + "kl": 0.0015163421630859375, + "learning_rate": 4.999833860868863e-07, + "loss": 0.0, + "num_tokens": 2996508.0, + "reward": 0.00390625, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0078125, + "rewards/equation_reward_func/std": 0.0883883461356163, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 1020.78125, + "completions/mean_terminated_length": 612.0, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "epoch": 0.07466666666666667, + "grad_norm": 0.5895570635639454, + "kl": 0.00185394287109375, + "learning_rate": 4.999626192130396e-07, + "loss": 0.0, + "num_tokens": 3146148.0, + "reward": 0.015625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 1017.4453125, + "completions/mean_terminated_length": 185.0, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.07822222222222222, + "grad_norm": 0.8536716379125333, + "kl": 0.002094268798828125, + "learning_rate": 4.99933546555722e-07, + "loss": -0.0036, + "num_tokens": 3295353.0, + "reward": 0.0234375, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 1022.59375, + "completions/mean_terminated_length": 844.0, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "epoch": 0.08177777777777778, + "grad_norm": 0.761179697351255, + "kl": 0.0025005340576171875, + "learning_rate": 4.998961690809627e-07, + "loss": -0.0012, + "num_tokens": 3445333.0, + "reward": 0.015625, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.0078125, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08533333333333333, + "grad_norm": 0.9164427994280226, + "kl": 0.0030078887939453125, + "learning_rate": 4.998504880307444e-07, + "loss": 0.0, + "num_tokens": 3595429.0, + "reward": 0.03125, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 1021.15625, + "completions/mean_terminated_length": 660.0, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "epoch": 0.08888888888888889, + "grad_norm": 0.49620338336363207, + "kl": 0.0037441253662109375, + "learning_rate": 4.997965049229614e-07, + "loss": 0.0, + "num_tokens": 3745113.0, + "reward": 0.01171875, + "reward_std": 0.016833597794175148, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.0078125, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.09244444444444444, + "grad_norm": 0.9470188431084309, + "kl": 0.0042877197265625, + "learning_rate": 4.997342215513703e-07, + "loss": 0.0, + "num_tokens": 3895201.0, + "reward": 0.0234375, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 1017.71875, + "completions/mean_terminated_length": 622.0, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.096, + "grad_norm": 0.9027646554963968, + "kl": 0.005214691162109375, + "learning_rate": 4.99663639985529e-07, + "loss": -0.0028, + "num_tokens": 4044461.0, + "reward": 0.0234375, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.0078125, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9765625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 1016.2734375, + "completions/mean_terminated_length": 694.3333740234375, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.09955555555555555, + "grad_norm": 1.1212987874003704, + "kl": 0.0063610076904296875, + "learning_rate": 4.995847625707292e-07, + "loss": -0.0012, + "num_tokens": 4193588.0, + "reward": 0.03515625, + "reward_std": 0.06370859593153, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.0078125, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10311111111111111, + "grad_norm": 0.5105677644040013, + "kl": 0.0079345703125, + "learning_rate": 4.994975919279175e-07, + "loss": 0.0, + "num_tokens": 4343632.0, + "reward": 0.0078125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0078125, + "rewards/equation_reward_func/std": 0.0883883461356163, + "rewards/format_reward_func/mean": 0.0078125, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.10666666666666667, + "grad_norm": 0.40495448135577117, + "kl": 0.010833740234375, + "learning_rate": 4.994021309536092e-07, + "loss": 0.0, + "num_tokens": 4493784.0, + "reward": 0.01171875, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 1022.296875, + "completions/mean_terminated_length": 806.0, + "completions/min_length": 806.0, + "completions/min_terminated_length": 806.0, + "epoch": 0.11022222222222222, + "grad_norm": 1.1342927264328857, + "kl": 0.010311126708984375, + "learning_rate": 4.992983828197911e-07, + "loss": 0.0, + "num_tokens": 4643598.0, + "reward": 0.0234375, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.0078125, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.11377777777777778, + "grad_norm": 0.7512859872904276, + "kl": 0.01190948486328125, + "learning_rate": 4.991863509738169e-07, + "loss": 0.0, + "num_tokens": 4793666.0, + "reward": 0.04296875, + "reward_std": 0.0661257952451706, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.0078125, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 1019.96875, + "completions/mean_terminated_length": 508.0, + "completions/min_length": 508.0, + "completions/min_terminated_length": 508.0, + "epoch": 0.11733333333333333, + "grad_norm": 0.9416577312187854, + "kl": 0.02165985107421875, + "learning_rate": 4.990660391382923e-07, + "loss": -0.0021, + "num_tokens": 4943234.0, + "reward": 0.046875, + "reward_std": 0.05831329524517059, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.0234375, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12088888888888889, + "grad_norm": 0.7545248940723044, + "kl": 0.0414276123046875, + "learning_rate": 4.989374513109511e-07, + "loss": 0.0, + "num_tokens": 5093326.0, + "reward": 0.0390625, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.0078125, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.12444444444444444, + "grad_norm": 0.7218584701517031, + "kl": 0.0307769775390625, + "learning_rate": 4.988005917645229e-07, + "loss": 0.0, + "num_tokens": 5243446.0, + "reward": 0.03515625, + "reward_std": 0.057104695588350296, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.03125, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.128, + "grad_norm": 0.8075422097889196, + "kl": 0.043731689453125, + "learning_rate": 4.986554650465906e-07, + "loss": 0.0, + "num_tokens": 5393610.0, + "reward": 0.02734375, + "reward_std": 0.04808359593153, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.0078125, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13155555555555556, + "grad_norm": 0.6475461270291981, + "kl": 0.044952392578125, + "learning_rate": 4.985020759794397e-07, + "loss": 0.0, + "num_tokens": 5543710.0, + "reward": 0.03515625, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.0, + "rewards/format_reward_func/std": 0.0, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1351111111111111, + "grad_norm": 0.754339390837337, + "kl": 0.0526123046875, + "learning_rate": 4.983404296598978e-07, + "loss": 0.0001, + "num_tokens": 5693790.0, + "reward": 0.06640625, + "reward_std": 0.0973757952451706, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.0390625, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.13866666666666666, + "grad_norm": 0.7041423598747788, + "kl": 0.057220458984375, + "learning_rate": 4.981705314591655e-07, + "loss": 0.0001, + "num_tokens": 5843866.0, + "reward": 0.05859375, + "reward_std": 0.07272969186306, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.0390625, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.14222222222222222, + "grad_norm": 0.649800348452034, + "kl": 0.037445068359375, + "learning_rate": 4.979923870226372e-07, + "loss": 0.0, + "num_tokens": 5994018.0, + "reward": 0.0546875, + "reward_std": 0.07767495512962341, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.015625, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 1021.2578125, + "completions/mean_terminated_length": 673.0, + "completions/min_length": 673.0, + "completions/min_terminated_length": 673.0, + "epoch": 0.14577777777777778, + "grad_norm": 0.9175028834738074, + "kl": 0.047149658203125, + "learning_rate": 4.978060022697148e-07, + "loss": 0.0, + "num_tokens": 6143759.0, + "reward": 0.1015625, + "reward_std": 0.12323048710823059, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.09375, + "rewards/format_reward_func/std": 0.29262590408325195, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 1021.46875, + "completions/mean_terminated_length": 700.0, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "epoch": 0.14933333333333335, + "grad_norm": 0.8957971400137922, + "kl": 0.052337646484375, + "learning_rate": 4.976113833936098e-07, + "loss": 0.0039, + "num_tokens": 6293559.0, + "reward": 0.1015625, + "reward_std": 0.13169725239276886, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.078125, + "rewards/format_reward_func/std": 0.2694226801395416, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15288888888888888, + "grad_norm": 2.0122883915276466, + "kl": 0.170257568359375, + "learning_rate": 4.974085368611381e-07, + "loss": 0.0002, + "num_tokens": 6443715.0, + "reward": 0.0703125, + "reward_std": 0.09803006052970886, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.0703125, + "rewards/format_reward_func/std": 0.2566775679588318, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.15644444444444444, + "grad_norm": 7.0766541269315875, + "kl": 0.728851318359375, + "learning_rate": 4.971974694125051e-07, + "loss": 0.0007, + "num_tokens": 6593715.0, + "reward": 0.140625, + "reward_std": 0.1269671469926834, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.1640625, + "rewards/format_reward_func/std": 0.371787428855896, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16, + "grad_norm": 4.872646650485278, + "kl": 0.322509765625, + "learning_rate": 4.969781880610813e-07, + "loss": 0.0003, + "num_tokens": 6743827.0, + "reward": 0.1640625, + "reward_std": 0.19241680204868317, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.1796875, + "rewards/format_reward_func/std": 0.3854354918003082, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.16355555555555557, + "grad_norm": 1.0088881954713835, + "kl": 0.070465087890625, + "learning_rate": 4.967507000931702e-07, + "loss": 0.0001, + "num_tokens": 6893915.0, + "reward": 0.1953125, + "reward_std": 0.1971469223499298, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.1953125, + "rewards/format_reward_func/std": 0.3979988098144531, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1671111111111111, + "grad_norm": 3.180876744240654, + "kl": 0.444976806640625, + "learning_rate": 4.965150130677651e-07, + "loss": 0.0004, + "num_tokens": 7043971.0, + "reward": 0.19921875, + "reward_std": 0.2139805108308792, + "rewards/equation_reward_func/mean": 0.1796875, + "rewards/equation_reward_func/std": 0.3854354918003082, + "rewards/format_reward_func/mean": 0.21875, + "rewards/format_reward_func/std": 0.41502299904823303, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17066666666666666, + "grad_norm": 1.2194553812166182, + "kl": 0.196197509765625, + "learning_rate": 4.962711348162987e-07, + "loss": 0.0002, + "num_tokens": 7194079.0, + "reward": 0.1640625, + "reward_std": 0.16161686182022095, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.171875, + "rewards/format_reward_func/std": 0.3787541687488556, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17422222222222222, + "grad_norm": 1.3058480703859512, + "kl": 0.093231201171875, + "learning_rate": 4.960190734423824e-07, + "loss": 0.0001, + "num_tokens": 7344127.0, + "reward": 0.2109375, + "reward_std": 0.2336813509464264, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.434714138507843, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.17777777777777778, + "grad_norm": 0.8822642087267891, + "kl": 0.10333251953125, + "learning_rate": 4.957588373215373e-07, + "loss": 0.0001, + "num_tokens": 7494299.0, + "reward": 0.14453125, + "reward_std": 0.15854540467262268, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.1953125, + "rewards/format_reward_func/std": 0.3979988098144531, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18133333333333335, + "grad_norm": 5.708085997498834, + "kl": 0.7578125, + "learning_rate": 4.954904351009156e-07, + "loss": 0.0008, + "num_tokens": 7644343.0, + "reward": 0.2421875, + "reward_std": 0.24566304683685303, + "rewards/equation_reward_func/mean": 0.203125, + "rewards/equation_reward_func/std": 0.40390563011169434, + "rewards/format_reward_func/mean": 0.28125, + "rewards/format_reward_func/std": 0.4513758420944214, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 1020.109375, + "completions/mean_terminated_length": 526.0, + "completions/min_length": 526.0, + "completions/min_terminated_length": 526.0, + "epoch": 0.18488888888888888, + "grad_norm": 1.323493532471514, + "kl": 0.12872314453125, + "learning_rate": 4.952138756990142e-07, + "loss": 0.0001, + "num_tokens": 7793957.0, + "reward": 0.19921875, + "reward_std": 0.19263195991516113, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.265625, + "rewards/format_reward_func/std": 0.44340085983276367, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.18844444444444444, + "grad_norm": 1.4983556320698812, + "kl": 0.248046875, + "learning_rate": 4.949291683053768e-07, + "loss": 0.0002, + "num_tokens": 7943977.0, + "reward": 0.2109375, + "reward_std": 0.22092357277870178, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.434714138507843, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.192, + "grad_norm": 2.4977361409936805, + "kl": 0.43695068359375, + "learning_rate": 4.946363223802901e-07, + "loss": 0.0004, + "num_tokens": 8094081.0, + "reward": 0.2734375, + "reward_std": 0.211696058511734, + "rewards/equation_reward_func/mean": 0.2265625, + "rewards/equation_reward_func/std": 0.4202519655227661, + "rewards/format_reward_func/mean": 0.3203125, + "rewards/format_reward_func/std": 0.4684300124645233, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 1022.2265625, + "completions/mean_terminated_length": 797.0, + "completions/min_length": 797.0, + "completions/min_terminated_length": 797.0, + "epoch": 0.19555555555555557, + "grad_norm": 1.6221417605008561, + "kl": 0.24462890625, + "learning_rate": 4.943353476544681e-07, + "loss": -0.0012, + "num_tokens": 8243906.0, + "reward": 0.23046875, + "reward_std": 0.19329938292503357, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.3046875, + "rewards/format_reward_func/std": 0.46208351850509644, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.1991111111111111, + "grad_norm": 2.6760159527432683, + "kl": 0.376220703125, + "learning_rate": 4.940262541287302e-07, + "loss": 0.0004, + "num_tokens": 8393978.0, + "reward": 0.21875, + "reward_std": 0.18384575843811035, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.28125, + "rewards/format_reward_func/std": 0.4513758420944214, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20266666666666666, + "grad_norm": 22.524324011242694, + "kl": 2.3280029296875, + "learning_rate": 4.937090520736671e-07, + "loss": 0.0023, + "num_tokens": 8544074.0, + "reward": 0.1953125, + "reward_std": 0.22807088494300842, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.2734375, + "rewards/format_reward_func/std": 0.447474867105484, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20622222222222222, + "grad_norm": 6.300577551599378, + "kl": 0.8486328125, + "learning_rate": 4.933837520293017e-07, + "loss": 0.0008, + "num_tokens": 8694182.0, + "reward": 0.22265625, + "reward_std": 0.20167279243469238, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.296875, + "rewards/format_reward_func/std": 0.45867621898651123, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.20977777777777779, + "grad_norm": 28.006158731494125, + "kl": 0.86181640625, + "learning_rate": 4.930503648047367e-07, + "loss": 0.0009, + "num_tokens": 8844278.0, + "reward": 0.2734375, + "reward_std": 0.2949552536010742, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.3515625, + "rewards/format_reward_func/std": 0.4793342351913452, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21333333333333335, + "grad_norm": 9.33584965819257, + "kl": 0.4361572265625, + "learning_rate": 4.927089014777972e-07, + "loss": 0.0004, + "num_tokens": 8994330.0, + "reward": 0.28125, + "reward_std": 0.24258936941623688, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.3671875, + "rewards/format_reward_func/std": 0.4839322865009308, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.21688888888888888, + "grad_norm": 6.303532871380138, + "kl": 0.6859130859375, + "learning_rate": 4.923593733946614e-07, + "loss": 0.0007, + "num_tokens": 9144434.0, + "reward": 0.296875, + "reward_std": 0.21597611904144287, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.421875, + "rewards/format_reward_func/std": 0.4957992732524872, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.22044444444444444, + "grad_norm": 8.57852289803169, + "kl": 1.74853515625, + "learning_rate": 4.920017921694841e-07, + "loss": 0.0018, + "num_tokens": 9294478.0, + "reward": 0.2890625, + "reward_std": 0.20475518703460693, + "rewards/equation_reward_func/mean": 0.2109375, + "rewards/equation_reward_func/std": 0.4095771610736847, + "rewards/format_reward_func/mean": 0.3671875, + "rewards/format_reward_func/std": 0.4839322865009308, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.224, + "grad_norm": 3.8660166772097195, + "kl": 0.3358154296875, + "learning_rate": 4.91636169684011e-07, + "loss": 0.0003, + "num_tokens": 9444610.0, + "reward": 0.2734375, + "reward_std": 0.23609855771064758, + "rewards/equation_reward_func/mean": 0.1640625, + "rewards/equation_reward_func/std": 0.371787428855896, + "rewards/format_reward_func/mean": 0.3828125, + "rewards/format_reward_func/std": 0.4879830479621887, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.22755555555555557, + "grad_norm": 12.93870306614523, + "kl": 1.1669921875, + "learning_rate": 4.912625180871833e-07, + "loss": 0.0012, + "num_tokens": 9594650.0, + "reward": 0.25390625, + "reward_std": 0.23785054683685303, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.375, + "rewards/format_reward_func/std": 0.4860251843929291, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2311111111111111, + "grad_norm": 7.511898616426601, + "kl": 0.55322265625, + "learning_rate": 4.908808497947346e-07, + "loss": 0.0006, + "num_tokens": 9744694.0, + "reward": 0.33984375, + "reward_std": 0.27659574151039124, + "rewards/equation_reward_func/mean": 0.234375, + "rewards/equation_reward_func/std": 0.42527204751968384, + "rewards/format_reward_func/mean": 0.4453125, + "rewards/format_reward_func/std": 0.4989531338214874, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.23466666666666666, + "grad_norm": 1.8438447972930012, + "kl": 0.2886962890625, + "learning_rate": 4.904911774887779e-07, + "loss": 0.0003, + "num_tokens": 9894774.0, + "reward": 0.3515625, + "reward_std": 0.24986067414283752, + "rewards/equation_reward_func/mean": 0.2890625, + "rewards/equation_reward_func/std": 0.45510825514793396, + "rewards/format_reward_func/mean": 0.4140625, + "rewards/format_reward_func/std": 0.49449479579925537, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 1021.21875, + "completions/mean_terminated_length": 668.0, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "epoch": 0.23822222222222222, + "grad_norm": 1.4441757047110426, + "kl": 0.230224609375, + "learning_rate": 4.900935141173842e-07, + "loss": 0.0016, + "num_tokens": 10044558.0, + "reward": 0.234375, + "reward_std": 0.21178939938545227, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.3359375, + "rewards/format_reward_func/std": 0.47417303919792175, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24177777777777779, + "grad_norm": 1.6691952805088401, + "kl": 0.329833984375, + "learning_rate": 4.896878728941531e-07, + "loss": 0.0003, + "num_tokens": 10194698.0, + "reward": 0.203125, + "reward_std": 0.20396818220615387, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.3046875, + "rewards/format_reward_func/std": 0.46208351850509644, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24533333333333332, + "grad_norm": 13.720929458457615, + "kl": 3.0303955078125, + "learning_rate": 4.892742672977722e-07, + "loss": 0.003, + "num_tokens": 10344786.0, + "reward": 0.3203125, + "reward_std": 0.25942516326904297, + "rewards/equation_reward_func/mean": 0.21875, + "rewards/equation_reward_func/std": 0.41502299904823303, + "rewards/format_reward_func/mean": 0.421875, + "rewards/format_reward_func/std": 0.4957992732524872, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.24888888888888888, + "grad_norm": 2.4164360870318733, + "kl": 0.5142822265625, + "learning_rate": 4.888527110715709e-07, + "loss": 0.0005, + "num_tokens": 10494850.0, + "reward": 0.29296875, + "reward_std": 0.2201562374830246, + "rewards/equation_reward_func/mean": 0.2109375, + "rewards/equation_reward_func/std": 0.4095771610736847, + "rewards/format_reward_func/mean": 0.375, + "rewards/format_reward_func/std": 0.4860251843929291, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 1020.3125, + "completions/mean_terminated_length": 552.0, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "epoch": 0.25244444444444447, + "grad_norm": 1.7293321306646259, + "kl": 0.32080078125, + "learning_rate": 4.884232182230623e-07, + "loss": -0.003, + "num_tokens": 10644490.0, + "reward": 0.234375, + "reward_std": 0.23798327147960663, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.328125, + "rewards/format_reward_func/std": 0.4713755249977112, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.256, + "grad_norm": 1.115711151199567, + "kl": 0.373046875, + "learning_rate": 4.879858030234789e-07, + "loss": 0.0004, + "num_tokens": 10794594.0, + "reward": 0.28125, + "reward_std": 0.17779618501663208, + "rewards/equation_reward_func/mean": 0.21875, + "rewards/equation_reward_func/std": 0.41502299904823303, + "rewards/format_reward_func/mean": 0.34375, + "rewards/format_reward_func/std": 0.47682511806488037, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.25955555555555554, + "grad_norm": 24.35412756965178, + "kl": 2.08587646484375, + "learning_rate": 4.875404800072976e-07, + "loss": 0.0021, + "num_tokens": 10944666.0, + "reward": 0.2109375, + "reward_std": 0.2127828449010849, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.3125, + "rewards/format_reward_func/std": 0.4653336703777313, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26311111111111113, + "grad_norm": 1554.2762073710478, + "kl": 112.7879638671875, + "learning_rate": 4.870872639717572e-07, + "loss": 0.1126, + "num_tokens": 11094806.0, + "reward": 0.17578125, + "reward_std": 0.2021140456199646, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.28125, + "rewards/format_reward_func/std": 0.4513758420944214, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.26666666666666666, + "grad_norm": 1.1410728341145782, + "kl": 0.2967529296875, + "learning_rate": 4.866261699763664e-07, + "loss": 0.0003, + "num_tokens": 11244874.0, + "reward": 0.24609375, + "reward_std": 0.2407177835702896, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.3046875, + "rewards/format_reward_func/std": 0.46208351850509644, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2702222222222222, + "grad_norm": 6.430383302085185, + "kl": 1.0128173828125, + "learning_rate": 4.861572133424035e-07, + "loss": 0.001, + "num_tokens": 11394958.0, + "reward": 0.28515625, + "reward_std": 0.2505149245262146, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.375, + "rewards/format_reward_func/std": 0.4860251843929291, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2737777777777778, + "grad_norm": 7.0576042073325, + "kl": 1.8004150390625, + "learning_rate": 4.856804096524078e-07, + "loss": 0.0018, + "num_tokens": 11545110.0, + "reward": 0.22265625, + "reward_std": 0.16942934691905975, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.296875, + "rewards/format_reward_func/std": 0.45867621898651123, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2773333333333333, + "grad_norm": 56.11950857705648, + "kl": 7.319580078125, + "learning_rate": 4.851957747496606e-07, + "loss": 0.0073, + "num_tokens": 11695202.0, + "reward": 0.1953125, + "reward_std": 0.19627749919891357, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.2421875, + "rewards/format_reward_func/std": 0.4300905168056488, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2808888888888889, + "grad_norm": 1.1894403579564377, + "kl": 0.2705078125, + "learning_rate": 4.847033247376605e-07, + "loss": 0.0003, + "num_tokens": 11845266.0, + "reward": 0.16796875, + "reward_std": 0.20112939178943634, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.2421875, + "rewards/format_reward_func/std": 0.4300905168056488, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.28444444444444444, + "grad_norm": 15.58649087673573, + "kl": 5.33154296875, + "learning_rate": 4.842030759795866e-07, + "loss": 0.0053, + "num_tokens": 11995326.0, + "reward": 0.20703125, + "reward_std": 0.19561228156089783, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.2890625, + "rewards/format_reward_func/std": 0.45510825514793396, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.288, + "grad_norm": 1.2211524541098628, + "kl": 0.345703125, + "learning_rate": 4.836950450977558e-07, + "loss": 0.0003, + "num_tokens": 12145330.0, + "reward": 0.28515625, + "reward_std": 0.2513952851295471, + "rewards/equation_reward_func/mean": 0.2265625, + "rewards/equation_reward_func/std": 0.4202519655227661, + "rewards/format_reward_func/mean": 0.34375, + "rewards/format_reward_func/std": 0.47682511806488037, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.29155555555555557, + "grad_norm": 11.595035432688796, + "kl": 0.9814453125, + "learning_rate": 4.831792489730703e-07, + "loss": 0.001, + "num_tokens": 12295462.0, + "reward": 0.171875, + "reward_std": 0.16823168098926544, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.1953125, + "rewards/format_reward_func/std": 0.3979988098144531, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2951111111111111, + "grad_norm": 1.1924550929402917, + "kl": 0.3919677734375, + "learning_rate": 4.826557047444563e-07, + "loss": 0.0004, + "num_tokens": 12445498.0, + "reward": 0.14453125, + "reward_std": 0.18077430129051208, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.1953125, + "rewards/format_reward_func/std": 0.3979988098144531, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.2986666666666667, + "grad_norm": 21.230632420638482, + "kl": 3.0062255859375, + "learning_rate": 4.821244298082951e-07, + "loss": 0.003, + "num_tokens": 12595542.0, + "reward": 0.22265625, + "reward_std": 0.22652746737003326, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.2734375, + "rewards/format_reward_func/std": 0.447474867105484, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3022222222222222, + "grad_norm": 1.0369057223186056, + "kl": 0.2333984375, + "learning_rate": 4.815854418178445e-07, + "loss": 0.0002, + "num_tokens": 12745614.0, + "reward": 0.19140625, + "reward_std": 0.20221619307994843, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.2578125, + "rewards/format_reward_func/std": 0.43914902210235596, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30577777777777776, + "grad_norm": 1.0278344273119406, + "kl": 0.1983642578125, + "learning_rate": 4.810387586826527e-07, + "loss": 0.0002, + "num_tokens": 12895698.0, + "reward": 0.2734375, + "reward_std": 0.2464390993118286, + "rewards/equation_reward_func/mean": 0.203125, + "rewards/equation_reward_func/std": 0.40390563011169434, + "rewards/format_reward_func/mean": 0.34375, + "rewards/format_reward_func/std": 0.47682511806488037, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.30933333333333335, + "grad_norm": 9.38166565142255, + "kl": 1.9534912109375, + "learning_rate": 4.804843985679626e-07, + "loss": 0.002, + "num_tokens": 13045762.0, + "reward": 0.171875, + "reward_std": 0.2152000367641449, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.2109375, + "rewards/format_reward_func/std": 0.4095771610736847, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3128888888888889, + "grad_norm": 1.1768601924784283, + "kl": 0.3209228515625, + "learning_rate": 4.799223798941089e-07, + "loss": 0.0003, + "num_tokens": 13195926.0, + "reward": 0.16015625, + "reward_std": 0.18319149315357208, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.21875, + "rewards/format_reward_func/std": 0.41502299904823303, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3164444444444444, + "grad_norm": 0.6415947722993769, + "kl": 0.1815185546875, + "learning_rate": 4.793527213359058e-07, + "loss": 0.0002, + "num_tokens": 13346058.0, + "reward": 0.1015625, + "reward_std": 0.1392945945262909, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.125, + "rewards/format_reward_func/std": 0.3320184051990509, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32, + "grad_norm": 1.1209935354079488, + "kl": 0.1925048828125, + "learning_rate": 4.787754418220257e-07, + "loss": 0.0002, + "num_tokens": 13496054.0, + "reward": 0.171875, + "reward_std": 0.21861067414283752, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.2265625, + "rewards/format_reward_func/std": 0.4202519655227661, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32355555555555554, + "grad_norm": 1.1350649820782601, + "kl": 0.2437744140625, + "learning_rate": 4.781905605343716e-07, + "loss": 0.0002, + "num_tokens": 13646078.0, + "reward": 0.16796875, + "reward_std": 0.1717531979084015, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.234375, + "rewards/format_reward_func/std": 0.42527204751968384, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.32711111111111113, + "grad_norm": 0.8036131940570752, + "kl": 0.2301025390625, + "learning_rate": 4.775980969074385e-07, + "loss": 0.0002, + "num_tokens": 13796150.0, + "reward": 0.17578125, + "reward_std": 0.1618429571390152, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.203125, + "rewards/format_reward_func/std": 0.40390563011169434, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.33066666666666666, + "grad_norm": 2.5669796312022144, + "kl": 0.4324951171875, + "learning_rate": 4.769980706276687e-07, + "loss": 0.0004, + "num_tokens": 13946262.0, + "reward": 0.203125, + "reward_std": 0.2259906381368637, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.2578125, + "rewards/format_reward_func/std": 0.43914902210235596, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3342222222222222, + "grad_norm": 1.5566319365843342, + "kl": 0.2623291015625, + "learning_rate": 4.7639050163279646e-07, + "loss": 0.0003, + "num_tokens": 14096386.0, + "reward": 0.14453125, + "reward_std": 0.1585453897714615, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.1796875, + "rewards/format_reward_func/std": 0.3854354918003082, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3377777777777778, + "grad_norm": 0.9827466599698991, + "kl": 0.2091064453125, + "learning_rate": 4.757754101111867e-07, + "loss": 0.0002, + "num_tokens": 14246454.0, + "reward": 0.2109375, + "reward_std": 0.2073678970336914, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.2734375, + "rewards/format_reward_func/std": 0.447474867105484, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3413333333333333, + "grad_norm": 0.8474200819506911, + "kl": 0.1885986328125, + "learning_rate": 4.751528165011633e-07, + "loss": 0.0002, + "num_tokens": 14396482.0, + "reward": 0.1953125, + "reward_std": 0.19494709372520447, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.25, + "rewards/format_reward_func/std": 0.434714138507843, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3448888888888889, + "grad_norm": 0.8935139225334848, + "kl": 0.19091796875, + "learning_rate": 4.7452274149033036e-07, + "loss": 0.0002, + "num_tokens": 14546506.0, + "reward": 0.22265625, + "reward_std": 0.1896713674068451, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.2734375, + "rewards/format_reward_func/std": 0.447474867105484, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.34844444444444445, + "grad_norm": 0.8908823710522052, + "kl": 0.1903076171875, + "learning_rate": 4.738852060148848e-07, + "loss": 0.0002, + "num_tokens": 14696570.0, + "reward": 0.16015625, + "reward_std": 0.17767438292503357, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.1953125, + "rewards/format_reward_func/std": 0.3979988098144531, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.352, + "grad_norm": 0.8940407168561494, + "kl": 0.15765380859375, + "learning_rate": 4.7324023125892067e-07, + "loss": 0.0002, + "num_tokens": 14846678.0, + "reward": 0.13671875, + "reward_std": 0.16569268703460693, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.2109375, + "rewards/format_reward_func/std": 0.4095771610736847, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.35555555555555557, + "grad_norm": 1.0372146373302493, + "kl": 0.2196044921875, + "learning_rate": 4.7258783865372496e-07, + "loss": 0.0002, + "num_tokens": 14996674.0, + "reward": 0.296875, + "reward_std": 0.29111427068710327, + "rewards/equation_reward_func/mean": 0.2421875, + "rewards/equation_reward_func/std": 0.4300905168056488, + "rewards/format_reward_func/mean": 0.3515625, + "rewards/format_reward_func/std": 0.4793342351913452, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3591111111111111, + "grad_norm": 3.8175886041475904, + "kl": 0.525634765625, + "learning_rate": 4.719280498770659e-07, + "loss": 0.0005, + "num_tokens": 15146770.0, + "reward": 0.265625, + "reward_std": 0.1947406530380249, + "rewards/equation_reward_func/mean": 0.234375, + "rewards/equation_reward_func/std": 0.42527204751968384, + "rewards/format_reward_func/mean": 0.296875, + "rewards/format_reward_func/std": 0.45867621898651123, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3626666666666667, + "grad_norm": 1.7543264789310198, + "kl": 0.295166015625, + "learning_rate": 4.712608868524726e-07, + "loss": 0.0003, + "num_tokens": 15296818.0, + "reward": 0.1796875, + "reward_std": 0.18956050276756287, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.28125, + "rewards/format_reward_func/std": 0.4513758420944214, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3662222222222222, + "grad_norm": 1.0227525620642117, + "kl": 0.20050048828125, + "learning_rate": 4.70586371748506e-07, + "loss": 0.0002, + "num_tokens": 15446870.0, + "reward": 0.2578125, + "reward_std": 0.19561007618904114, + "rewards/equation_reward_func/mean": 0.21875, + "rewards/equation_reward_func/std": 0.41502299904823303, + "rewards/format_reward_func/mean": 0.296875, + "rewards/format_reward_func/std": 0.45867621898651123, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.36977777777777776, + "grad_norm": 0.9640594800874314, + "kl": 0.2506103515625, + "learning_rate": 4.699045269780232e-07, + "loss": 0.0003, + "num_tokens": 15596974.0, + "reward": 0.25390625, + "reward_std": 0.21409572660923004, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.3203125, + "rewards/format_reward_func/std": 0.4684300124645233, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.37333333333333335, + "grad_norm": 0.9924489289475691, + "kl": 0.281982421875, + "learning_rate": 4.692153751974318e-07, + "loss": 0.0003, + "num_tokens": 15747106.0, + "reward": 0.2421875, + "reward_std": 0.1833132803440094, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.328125, + "rewards/format_reward_func/std": 0.4713755249977112, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3768888888888889, + "grad_norm": 1.1958906934430737, + "kl": 0.2791748046875, + "learning_rate": 4.685189393059377e-07, + "loss": 0.0003, + "num_tokens": 15897182.0, + "reward": 0.28515625, + "reward_std": 0.2538124918937683, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.453125, + "rewards/format_reward_func/std": 0.4997538626194, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3804444444444444, + "grad_norm": 4.660458310479511, + "kl": 0.49755859375, + "learning_rate": 4.6781524244478374e-07, + "loss": 0.0005, + "num_tokens": 16047302.0, + "reward": 0.31640625, + "reward_std": 0.23313136398792267, + "rewards/equation_reward_func/mean": 0.1640625, + "rewards/equation_reward_func/std": 0.371787428855896, + "rewards/format_reward_func/mean": 0.46875, + "rewards/format_reward_func/std": 0.5009832978248596, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.384, + "grad_norm": 1.2324512257183518, + "kl": 0.3973388671875, + "learning_rate": 4.6710430799648143e-07, + "loss": 0.0004, + "num_tokens": 16197302.0, + "reward": 0.3125, + "reward_std": 0.19484493136405945, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.4765625, + "rewards/format_reward_func/std": 0.5014128684997559, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.38755555555555554, + "grad_norm": 1.2549179396404038, + "kl": 0.2296142578125, + "learning_rate": 4.663861595840332e-07, + "loss": 0.0002, + "num_tokens": 16347306.0, + "reward": 0.390625, + "reward_std": 0.24787381291389465, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.59375, + "rewards/format_reward_func/std": 0.4930621087551117, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39111111111111113, + "grad_norm": 1.078204553596969, + "kl": 0.3565673828125, + "learning_rate": 4.6566082107014795e-07, + "loss": 0.0004, + "num_tokens": 16497418.0, + "reward": 0.27734375, + "reward_std": 0.22730353474617004, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.4375, + "rewards/format_reward_func/std": 0.49802759289741516, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.39466666666666667, + "grad_norm": 13.095496410143678, + "kl": 1.576416015625, + "learning_rate": 4.649283165564479e-07, + "loss": 0.0016, + "num_tokens": 16647462.0, + "reward": 0.30859375, + "reward_std": 0.19462978839874268, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.4921875, + "rewards/format_reward_func/std": 0.5019033551216125, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.3982222222222222, + "grad_norm": 1.1570771083116458, + "kl": 0.3052978515625, + "learning_rate": 4.6418867038266807e-07, + "loss": 0.0003, + "num_tokens": 16797554.0, + "reward": 0.3046875, + "reward_std": 0.227077454328537, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.46875, + "rewards/format_reward_func/std": 0.5009832978248596, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4017777777777778, + "grad_norm": 1.2112330269604161, + "kl": 0.267578125, + "learning_rate": 4.6344190712584713e-07, + "loss": 0.0003, + "num_tokens": 16947698.0, + "reward": 0.3671875, + "reward_std": 0.2886773347854614, + "rewards/equation_reward_func/mean": 0.1796875, + "rewards/equation_reward_func/std": 0.3854354918003082, + "rewards/format_reward_func/mean": 0.5546875, + "rewards/format_reward_func/std": 0.4989531338214874, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4053333333333333, + "grad_norm": 1.0237828322135445, + "kl": 0.248291015625, + "learning_rate": 4.6268805159951086e-07, + "loss": 0.0002, + "num_tokens": 17097782.0, + "reward": 0.33984375, + "reward_std": 0.22488634288311005, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.546875, + "rewards/format_reward_func/std": 0.4997538626194, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4088888888888889, + "grad_norm": 3.3025894103980704, + "kl": 0.5584716796875, + "learning_rate": 4.619271288528478e-07, + "loss": 0.0006, + "num_tokens": 17247882.0, + "reward": 0.27734375, + "reward_std": 0.2406156361103058, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.453125, + "rewards/format_reward_func/std": 0.4997538626194, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41244444444444445, + "grad_norm": 4.1382469339470225, + "kl": 0.8095703125, + "learning_rate": 4.611591641698768e-07, + "loss": 0.0008, + "num_tokens": 17397994.0, + "reward": 0.28125, + "reward_std": 0.1947515904903412, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.46875, + "rewards/format_reward_func/std": 0.5009832978248596, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.416, + "grad_norm": 15.502900105571765, + "kl": 3.479736328125, + "learning_rate": 4.6038418306860695e-07, + "loss": 0.0035, + "num_tokens": 17548122.0, + "reward": 0.33203125, + "reward_std": 0.2296164482831955, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.5625, + "rewards/format_reward_func/std": 0.49802759289741516, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.41955555555555557, + "grad_norm": 1.2758988778808227, + "kl": 0.275390625, + "learning_rate": 4.596022113001894e-07, + "loss": 0.0003, + "num_tokens": 17698266.0, + "reward": 0.375, + "reward_std": 0.203322634100914, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.5625, + "rewards/format_reward_func/std": 0.49802759289741516, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4231111111111111, + "grad_norm": 1.1770866594193163, + "kl": 0.3016357421875, + "learning_rate": 4.58813274848062e-07, + "loss": 0.0003, + "num_tokens": 17848314.0, + "reward": 0.37890625, + "reward_std": 0.23390743136405945, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.5625, + "rewards/format_reward_func/std": 0.49802759289741516, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4266666666666667, + "grad_norm": 2.4545856054451156, + "kl": 0.7242431640625, + "learning_rate": 4.5801739992708604e-07, + "loss": 0.0007, + "num_tokens": 17998410.0, + "reward": 0.31640625, + "reward_std": 0.208717942237854, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.5078125, + "rewards/format_reward_func/std": 0.5019033551216125, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43022222222222223, + "grad_norm": 1.4734406735109156, + "kl": 0.3726806640625, + "learning_rate": 4.572146129826746e-07, + "loss": 0.0004, + "num_tokens": 18148458.0, + "reward": 0.46875, + "reward_std": 0.23896577954292297, + "rewards/equation_reward_func/mean": 0.2421875, + "rewards/equation_reward_func/std": 0.4300905168056488, + "rewards/format_reward_func/mean": 0.6953125, + "rewards/format_reward_func/std": 0.46208351850509644, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43377777777777776, + "grad_norm": 5.758478512827765, + "kl": 1.009033203125, + "learning_rate": 4.5640494068991454e-07, + "loss": 0.001, + "num_tokens": 18298598.0, + "reward": 0.32421875, + "reward_std": 0.20069028437137604, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.5234375, + "rewards/format_reward_func/std": 0.5014128684997559, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.43733333333333335, + "grad_norm": 1.1248450015614464, + "kl": 0.288330078125, + "learning_rate": 4.555884099526793e-07, + "loss": 0.0003, + "num_tokens": 18448610.0, + "reward": 0.40234375, + "reward_std": 0.20794187486171722, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.6171875, + "rewards/format_reward_func/std": 0.4879830479621887, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4408888888888889, + "grad_norm": 1.117208300016219, + "kl": 0.271728515625, + "learning_rate": 4.547650479027361e-07, + "loss": 0.0003, + "num_tokens": 18598694.0, + "reward": 0.41015625, + "reward_std": 0.20739847421646118, + "rewards/equation_reward_func/mean": 0.2109375, + "rewards/equation_reward_func/std": 0.4095771610736847, + "rewards/format_reward_func/mean": 0.609375, + "rewards/format_reward_func/std": 0.4898075461387634, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4444444444444444, + "grad_norm": 1.212716720763158, + "kl": 0.28369140625, + "learning_rate": 4.53934881898843e-07, + "loss": 0.0003, + "num_tokens": 18748774.0, + "reward": 0.3828125, + "reward_std": 0.2037726789712906, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.625, + "rewards/format_reward_func/std": 0.4860251843929291, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.448, + "grad_norm": 1.4912885010455994, + "kl": 0.28466796875, + "learning_rate": 4.5309793952584095e-07, + "loss": 0.0003, + "num_tokens": 18898894.0, + "reward": 0.30078125, + "reward_std": 0.2230125367641449, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.4921875, + "rewards/format_reward_func/std": 0.5019033551216125, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45155555555555554, + "grad_norm": 1.6085493649140328, + "kl": 0.369873046875, + "learning_rate": 4.5225424859373684e-07, + "loss": 0.0004, + "num_tokens": 19048970.0, + "reward": 0.35546875, + "reward_std": 0.21158519387245178, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.625, + "rewards/format_reward_func/std": 0.4860251843929291, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45511111111111113, + "grad_norm": 8.1136624800545, + "kl": 1.5572509765625, + "learning_rate": 4.514038371367791e-07, + "loss": 0.0016, + "num_tokens": 19199078.0, + "reward": 0.42578125, + "reward_std": 0.19231687486171722, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.6953125, + "rewards/format_reward_func/std": 0.46208351850509644, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.45866666666666667, + "grad_norm": 1.4312492558258578, + "kl": 0.3763427734375, + "learning_rate": 4.5054673341252657e-07, + "loss": 0.0004, + "num_tokens": 19349186.0, + "reward": 0.34765625, + "reward_std": 0.1864890456199646, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.5859375, + "rewards/format_reward_func/std": 0.49449479579925537, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4622222222222222, + "grad_norm": 2.962105238864389, + "kl": 0.706787109375, + "learning_rate": 4.496829659009095e-07, + "loss": 0.0007, + "num_tokens": 19499346.0, + "reward": 0.34765625, + "reward_std": 0.18802587687969208, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.59375, + "rewards/format_reward_func/std": 0.4930621087551117, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4657777777777778, + "grad_norm": 354.1491510027472, + "kl": 56.3818359375, + "learning_rate": 4.488125633032831e-07, + "loss": 0.0565, + "num_tokens": 19649510.0, + "reward": 0.33984375, + "reward_std": 0.21212857961654663, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.578125, + "rewards/format_reward_func/std": 0.4957992732524872, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4693333333333333, + "grad_norm": 174.51099175867512, + "kl": 9.54150390625, + "learning_rate": 4.479355545414738e-07, + "loss": 0.0096, + "num_tokens": 19799590.0, + "reward": 0.41796875, + "reward_std": 0.19462977349758148, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.6640625, + "rewards/format_reward_func/std": 0.47417303919792175, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4728888888888889, + "grad_norm": 1.397870689490664, + "kl": 0.516845703125, + "learning_rate": 4.470519687568185e-07, + "loss": 0.0005, + "num_tokens": 19949690.0, + "reward": 0.38671875, + "reward_std": 0.18506528437137604, + "rewards/equation_reward_func/mean": 0.1640625, + "rewards/equation_reward_func/std": 0.371787428855896, + "rewards/format_reward_func/mean": 0.609375, + "rewards/format_reward_func/std": 0.4898075461387634, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.47644444444444445, + "grad_norm": 1.0641630797189836, + "kl": 0.30322265625, + "learning_rate": 4.4616183530919604e-07, + "loss": 0.0003, + "num_tokens": 20099654.0, + "reward": 0.4609375, + "reward_std": 0.17670938372612, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.734375, + "rewards/format_reward_func/std": 0.44340085983276367, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48, + "grad_norm": 6.158216503528497, + "kl": 1.213623046875, + "learning_rate": 4.452651837760515e-07, + "loss": 0.0012, + "num_tokens": 20249794.0, + "reward": 0.42578125, + "reward_std": 0.17373128235340118, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.65625, + "rewards/format_reward_func/std": 0.47682511806488037, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.48355555555555557, + "grad_norm": 1.3444116770940566, + "kl": 0.523681640625, + "learning_rate": 4.443620439514138e-07, + "loss": 0.0005, + "num_tokens": 20399882.0, + "reward": 0.3984375, + "reward_std": 0.16162778437137604, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.6953125, + "rewards/format_reward_func/std": 0.46208351850509644, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.4871111111111111, + "grad_norm": 1.531233557438578, + "kl": 0.55908203125, + "learning_rate": 4.4345244584490535e-07, + "loss": 0.0006, + "num_tokens": 20550006.0, + "reward": 0.48046875, + "reward_std": 0.21245458722114563, + "rewards/equation_reward_func/mean": 0.28125, + "rewards/equation_reward_func/std": 0.4513758420944214, + "rewards/format_reward_func/mean": 0.6796875, + "rewards/format_reward_func/std": 0.4684300124645233, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49066666666666664, + "grad_norm": 7.99434662228326, + "kl": 1.595458984375, + "learning_rate": 4.4253641968074505e-07, + "loss": 0.0016, + "num_tokens": 20700002.0, + "reward": 0.43359375, + "reward_std": 0.20639410614967346, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.671875, + "rewards/format_reward_func/std": 0.4713755249977112, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49422222222222223, + "grad_norm": 3.051331037407785, + "kl": 0.939697265625, + "learning_rate": 4.41613995896744e-07, + "loss": 0.0009, + "num_tokens": 20850102.0, + "reward": 0.3671875, + "reward_std": 0.19056487083435059, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.640625, + "rewards/format_reward_func/std": 0.481702595949173, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.49777777777777776, + "grad_norm": 4.814397537365208, + "kl": 1.9078369140625, + "learning_rate": 4.40685205143294e-07, + "loss": 0.0019, + "num_tokens": 21000234.0, + "reward": 0.375, + "reward_std": 0.1520632803440094, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.6171875, + "rewards/format_reward_func/std": 0.4879830479621887, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5013333333333333, + "grad_norm": 3.2517027439890924, + "kl": 0.801025390625, + "learning_rate": 4.3975007828234914e-07, + "loss": 0.0008, + "num_tokens": 21150326.0, + "reward": 0.43359375, + "reward_std": 0.2021140456199646, + "rewards/equation_reward_func/mean": 0.1640625, + "rewards/equation_reward_func/std": 0.371787428855896, + "rewards/format_reward_func/mean": 0.703125, + "rewards/format_reward_func/std": 0.45867621898651123, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5048888888888889, + "grad_norm": 1.663987879451297, + "kl": 0.525634765625, + "learning_rate": 4.3880864638640035e-07, + "loss": 0.0005, + "num_tokens": 21300382.0, + "reward": 0.4609375, + "reward_std": 0.17912657558918, + "rewards/equation_reward_func/mean": 0.1796875, + "rewards/equation_reward_func/std": 0.3854354918003082, + "rewards/format_reward_func/mean": 0.7421875, + "rewards/format_reward_func/std": 0.43914902210235596, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5084444444444445, + "grad_norm": 1.6351203503838305, + "kl": 0.519287109375, + "learning_rate": 4.37860940737443e-07, + "loss": 0.0005, + "num_tokens": 21450566.0, + "reward": 0.26171875, + "reward_std": 0.21883678436279297, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.5078125, + "rewards/format_reward_func/std": 0.5019033551216125, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.512, + "grad_norm": 1.366258610631795, + "kl": 0.574462890625, + "learning_rate": 4.3690699282593723e-07, + "loss": 0.0006, + "num_tokens": 21600694.0, + "reward": 0.4140625, + "reward_std": 0.22181487083435059, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.671875, + "rewards/format_reward_func/std": 0.4713755249977112, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5155555555555555, + "grad_norm": 1.2979528663337039, + "kl": 0.371826171875, + "learning_rate": 4.3594683434976186e-07, + "loss": 0.0004, + "num_tokens": 21750818.0, + "reward": 0.41015625, + "reward_std": 0.23919188976287842, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.6328125, + "rewards/format_reward_func/std": 0.4839322865009308, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5191111111111111, + "grad_norm": 13.971745146487256, + "kl": 2.259521484375, + "learning_rate": 4.3498049721316087e-07, + "loss": 0.0023, + "num_tokens": 21900886.0, + "reward": 0.48828125, + "reward_std": 0.18275237083435059, + "rewards/equation_reward_func/mean": 0.28125, + "rewards/equation_reward_func/std": 0.4513758420944214, + "rewards/format_reward_func/mean": 0.6953125, + "rewards/format_reward_func/std": 0.46208351850509644, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5226666666666666, + "grad_norm": 1.5151109592881105, + "kl": 0.50927734375, + "learning_rate": 4.340080135256835e-07, + "loss": 0.0005, + "num_tokens": 22050994.0, + "reward": 0.375, + "reward_std": 0.1720726490020752, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.609375, + "rewards/format_reward_func/std": 0.4898075461387634, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5262222222222223, + "grad_norm": 3.565613250452234, + "kl": 1.1435546875, + "learning_rate": 4.3302941560111716e-07, + "loss": 0.0011, + "num_tokens": 22201058.0, + "reward": 0.44140625, + "reward_std": 0.1594257354736328, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.6875, + "rewards/format_reward_func/std": 0.4653336703777313, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5297777777777778, + "grad_norm": 1.1082031850469054, + "kl": 0.389892578125, + "learning_rate": 4.3204473595641367e-07, + "loss": 0.0004, + "num_tokens": 22351074.0, + "reward": 0.5, + "reward_std": 0.17966999113559723, + "rewards/equation_reward_func/mean": 0.234375, + "rewards/equation_reward_func/std": 0.42527204751968384, + "rewards/format_reward_func/mean": 0.765625, + "rewards/format_reward_func/std": 0.42527204751968384, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5333333333333333, + "grad_norm": 29.666704145228632, + "kl": 5.103271484375, + "learning_rate": 4.3105400731060896e-07, + "loss": 0.0051, + "num_tokens": 22501194.0, + "reward": 0.44140625, + "reward_std": 0.161842942237854, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.75, + "rewards/format_reward_func/std": 0.434714138507843, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5368888888888889, + "grad_norm": 72.53647797594527, + "kl": 2.019775390625, + "learning_rate": 4.300572625837359e-07, + "loss": 0.002, + "num_tokens": 22651314.0, + "reward": 0.3828125, + "reward_std": 0.20705929398536682, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.7109375, + "rewards/format_reward_func/std": 0.45510825514793396, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5404444444444444, + "grad_norm": 1.1753775278093777, + "kl": 0.44921875, + "learning_rate": 4.2905453489573007e-07, + "loss": 0.0004, + "num_tokens": 22801386.0, + "reward": 0.41015625, + "reward_std": 0.15327188372612, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.7265625, + "rewards/format_reward_func/std": 0.447474867105484, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.544, + "grad_norm": 1.8103430830421683, + "kl": 0.861328125, + "learning_rate": 4.280458575653296e-07, + "loss": 0.0009, + "num_tokens": 22951478.0, + "reward": 0.4375, + "reward_std": 0.16887937486171722, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.7265625, + "rewards/format_reward_func/std": 0.447474867105484, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5475555555555556, + "grad_norm": 92.17653023102632, + "kl": 29.57275390625, + "learning_rate": 4.2703126410896815e-07, + "loss": 0.0296, + "num_tokens": 23101526.0, + "reward": 0.421875, + "reward_std": 0.2033226490020752, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.7265625, + "rewards/format_reward_func/std": 0.447474867105484, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5511111111111111, + "grad_norm": 1.8601363913983529, + "kl": 0.711669921875, + "learning_rate": 4.2601078823966065e-07, + "loss": 0.0007, + "num_tokens": 23251622.0, + "reward": 0.3984375, + "reward_std": 0.23224879801273346, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.671875, + "rewards/format_reward_func/std": 0.4713755249977112, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5546666666666666, + "grad_norm": 1.9655716755995285, + "kl": 0.5400390625, + "learning_rate": 4.249844638658837e-07, + "loss": 0.0005, + "num_tokens": 23401694.0, + "reward": 0.4140625, + "reward_std": 0.1894671618938446, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.71875, + "rewards/format_reward_func/std": 0.4513758420944214, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5582222222222222, + "grad_norm": 1.2512804150325183, + "kl": 0.381103515625, + "learning_rate": 4.2395232509044856e-07, + "loss": 0.0004, + "num_tokens": 23551754.0, + "reward": 0.4140625, + "reward_std": 0.15502388775348663, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.78125, + "rewards/format_reward_func/std": 0.41502299904823303, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5617777777777778, + "grad_norm": 4.6864178808609065, + "kl": 1.0859375, + "learning_rate": 4.229144062093679e-07, + "loss": 0.0011, + "num_tokens": 23701886.0, + "reward": 0.40234375, + "reward_std": 0.17801132798194885, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.6796875, + "rewards/format_reward_func/std": 0.4684300124645233, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5653333333333334, + "grad_norm": 1.2185679157403768, + "kl": 0.41064453125, + "learning_rate": 4.218707417107166e-07, + "loss": 0.0004, + "num_tokens": 23852050.0, + "reward": 0.37109375, + "reward_std": 0.1713140904903412, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.6796875, + "rewards/format_reward_func/std": 0.4684300124645233, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5688888888888889, + "grad_norm": 1.248032189193752, + "kl": 0.443359375, + "learning_rate": 4.208213662734852e-07, + "loss": 0.0004, + "num_tokens": 24002034.0, + "reward": 0.48828125, + "reward_std": 0.14753741025924683, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.859375, + "rewards/format_reward_func/std": 0.3490002751350403, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5724444444444444, + "grad_norm": 1.0325417882527244, + "kl": 0.435546875, + "learning_rate": 4.197663147664281e-07, + "loss": 0.0004, + "num_tokens": 24152122.0, + "reward": 0.44140625, + "reward_std": 0.1556890904903412, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.828125, + "rewards/format_reward_func/std": 0.3787541687488556, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.576, + "grad_norm": 1.1687197449448241, + "kl": 0.60693359375, + "learning_rate": 4.187056222469046e-07, + "loss": 0.0006, + "num_tokens": 24302178.0, + "reward": 0.5, + "reward_std": 0.18526950478553772, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.8515625, + "rewards/format_reward_func/std": 0.356930136680603, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5795555555555556, + "grad_norm": 0.8560481833926494, + "kl": 0.405029296875, + "learning_rate": 4.1763932395971433e-07, + "loss": 0.0004, + "num_tokens": 24452246.0, + "reward": 0.51953125, + "reward_std": 0.08835469186306, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.8984375, + "rewards/format_reward_func/std": 0.3032590448856354, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5831111111111111, + "grad_norm": 0.9227050375160734, + "kl": 0.361572265625, + "learning_rate": 4.1656745533592565e-07, + "loss": 0.0004, + "num_tokens": 24602314.0, + "reward": 0.51953125, + "reward_std": 0.112550750374794, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.8984375, + "rewards/format_reward_func/std": 0.3032590448856354, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5866666666666667, + "grad_norm": 2.9673521250500965, + "kl": 0.835693359375, + "learning_rate": 4.1549005199169887e-07, + "loss": 0.0008, + "num_tokens": 24752402.0, + "reward": 0.48828125, + "reward_std": 0.125758558511734, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.875, + "rewards/format_reward_func/std": 0.3320184051990509, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5902222222222222, + "grad_norm": 59803.04638668722, + "kl": 5376.424560546875, + "learning_rate": 4.1440714972710245e-07, + "loss": 5.3755, + "num_tokens": 24902430.0, + "reward": 0.50390625, + "reward_std": 0.11067695170640945, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.875, + "rewards/format_reward_func/std": 0.3320184051990509, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5937777777777777, + "grad_norm": 0.9872798550394157, + "kl": 0.467041015625, + "learning_rate": 4.1331878452492366e-07, + "loss": 0.0005, + "num_tokens": 25052534.0, + "reward": 0.53125, + "reward_std": 0.135988250374794, + "rewards/equation_reward_func/mean": 0.1640625, + "rewards/equation_reward_func/std": 0.371787428855896, + "rewards/format_reward_func/mean": 0.8984375, + "rewards/format_reward_func/std": 0.3032590448856354, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.5973333333333334, + "grad_norm": 0.9968829039320042, + "kl": 0.5625, + "learning_rate": 4.122249925494726e-07, + "loss": 0.0006, + "num_tokens": 25202638.0, + "reward": 0.546875, + "reward_std": 0.09616719186306, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.8984375, + "rewards/format_reward_func/std": 0.3032590448856354, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6008888888888889, + "grad_norm": 1.407994078507014, + "kl": 0.69482421875, + "learning_rate": 4.111258101453809e-07, + "loss": 0.0007, + "num_tokens": 25352698.0, + "reward": 0.47265625, + "reward_std": 0.07933359593153, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.8828125, + "rewards/format_reward_func/std": 0.322907418012619, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6044444444444445, + "grad_norm": 0.9550324441837219, + "kl": 0.371337890625, + "learning_rate": 4.10021273836394e-07, + "loss": 0.0004, + "num_tokens": 25502766.0, + "reward": 0.47265625, + "reward_std": 0.09077189117670059, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.8984375, + "rewards/format_reward_func/std": 0.3032590448856354, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.608, + "grad_norm": 0.9530508768607797, + "kl": 0.538818359375, + "learning_rate": 4.0891142032415717e-07, + "loss": 0.0005, + "num_tokens": 25652830.0, + "reward": 0.4765625, + "reward_std": 0.0739382952451706, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.90625, + "rewards/format_reward_func/std": 0.29262590408325195, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6115555555555555, + "grad_norm": 1.3515658288339731, + "kl": 0.4130859375, + "learning_rate": 4.0779628648699647e-07, + "loss": 0.0004, + "num_tokens": 25802866.0, + "reward": 0.46484375, + "reward_std": 0.11960469186306, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.8359375, + "rewards/format_reward_func/std": 0.371787428855896, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6151111111111112, + "grad_norm": 5.062381594275897, + "kl": 1.2841796875, + "learning_rate": 4.066759093786931e-07, + "loss": 0.0013, + "num_tokens": 25952918.0, + "reward": 0.5703125, + "reward_std": 0.08054219186306, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6186666666666667, + "grad_norm": 1.884658172401711, + "kl": 0.83984375, + "learning_rate": 4.055503262272521e-07, + "loss": 0.0008, + "num_tokens": 26103022.0, + "reward": 0.5, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.921875, + "rewards/format_reward_func/std": 0.2694226801395416, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6222222222222222, + "grad_norm": 0.899477885337483, + "kl": 0.36328125, + "learning_rate": 4.044195744336656e-07, + "loss": 0.0004, + "num_tokens": 26253170.0, + "reward": 0.50390625, + "reward_std": 0.1130007952451706, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.8828125, + "rewards/format_reward_func/std": 0.322907418012619, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6257777777777778, + "grad_norm": 1.1287658852126066, + "kl": 0.556884765625, + "learning_rate": 4.0328369157066975e-07, + "loss": 0.0006, + "num_tokens": 26403294.0, + "reward": 0.5, + "reward_std": 0.10277109593153, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.8984375, + "rewards/format_reward_func/std": 0.3032590448856354, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6293333333333333, + "grad_norm": 0.9554300863560559, + "kl": 0.439453125, + "learning_rate": 4.021427153814965e-07, + "loss": 0.0004, + "num_tokens": 26553438.0, + "reward": 0.49609375, + "reward_std": 0.0859375, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.9375, + "rewards/format_reward_func/std": 0.24301259219646454, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6328888888888888, + "grad_norm": 1.8696085163129879, + "kl": 0.6962890625, + "learning_rate": 4.009966837786194e-07, + "loss": 0.0007, + "num_tokens": 26703546.0, + "reward": 0.45703125, + "reward_std": 0.12014809250831604, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.8515625, + "rewards/format_reward_func/std": 0.356930136680603, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6364444444444445, + "grad_norm": 0.8445236924167039, + "kl": 0.44580078125, + "learning_rate": 3.9984563484249355e-07, + "loss": 0.0004, + "num_tokens": 26853626.0, + "reward": 0.46484375, + "reward_std": 0.06370859593153, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.9296875, + "rewards/format_reward_func/std": 0.2566775679588318, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.64, + "grad_norm": 8.514060274236515, + "kl": 1.341064453125, + "learning_rate": 3.98689606820291e-07, + "loss": 0.0013, + "num_tokens": 27003610.0, + "reward": 0.52734375, + "reward_std": 0.07933359593153, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.921875, + "rewards/format_reward_func/std": 0.2694226801395416, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6435555555555555, + "grad_norm": 1.0918541430694348, + "kl": 0.39501953125, + "learning_rate": 3.975286381246288e-07, + "loss": 0.0004, + "num_tokens": 27153798.0, + "reward": 0.4609375, + "reward_std": 0.08714609593153, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.90625, + "rewards/format_reward_func/std": 0.29262590408325195, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6471111111111111, + "grad_norm": 0.6639080655090452, + "kl": 0.39892578125, + "learning_rate": 3.963627673322936e-07, + "loss": 0.0004, + "num_tokens": 27303818.0, + "reward": 0.55078125, + "reward_std": 0.0703125, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6506666666666666, + "grad_norm": 0.8008104294625259, + "kl": 0.34619140625, + "learning_rate": 3.951920331829592e-07, + "loss": 0.0003, + "num_tokens": 27453838.0, + "reward": 0.53125, + "reward_std": 0.0625, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6542222222222223, + "grad_norm": 2.4570583505386767, + "kl": 0.77587890625, + "learning_rate": 3.9401647457789977e-07, + "loss": 0.0008, + "num_tokens": 27603914.0, + "reward": 0.49609375, + "reward_std": 0.07933359593153, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9140625, + "rewards/format_reward_func/std": 0.2813730239868164, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6577777777777778, + "grad_norm": 0.7694633251007069, + "kl": 0.36328125, + "learning_rate": 3.9283613057869683e-07, + "loss": 0.0004, + "num_tokens": 27754050.0, + "reward": 0.5, + "reward_std": 0.0739382952451706, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9375, + "rewards/format_reward_func/std": 0.24301259219646454, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6613333333333333, + "grad_norm": 49.68125956240344, + "kl": 6.21240234375, + "learning_rate": 3.9165104040594144e-07, + "loss": 0.0062, + "num_tokens": 27904122.0, + "reward": 0.48828125, + "reward_std": 0.11541798710823059, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.875, + "rewards/format_reward_func/std": 0.3320184051990509, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6648888888888889, + "grad_norm": 1.597010810734867, + "kl": 0.74072265625, + "learning_rate": 3.9046124343793104e-07, + "loss": 0.0007, + "num_tokens": 28054230.0, + "reward": 0.50390625, + "reward_std": 0.08835469186306, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.921875, + "rewards/format_reward_func/std": 0.2694226801395416, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6684444444444444, + "grad_norm": 0.8478903871951797, + "kl": 0.490966796875, + "learning_rate": 3.8926677920936093e-07, + "loss": 0.0005, + "num_tokens": 28204330.0, + "reward": 0.53125, + "reward_std": 0.09858439117670059, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.921875, + "rewards/format_reward_func/std": 0.2694226801395416, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.672, + "grad_norm": 0.7803466603833829, + "kl": 0.361328125, + "learning_rate": 3.880676874100106e-07, + "loss": 0.0004, + "num_tokens": 28354446.0, + "reward": 0.56640625, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.1796875, + "rewards/equation_reward_func/std": 0.3854354918003082, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6755555555555556, + "grad_norm": 0.6667441813180046, + "kl": 0.370361328125, + "learning_rate": 3.868640078834251e-07, + "loss": 0.0004, + "num_tokens": 28504506.0, + "reward": 0.53515625, + "reward_std": 0.06370859593153, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6791111111111111, + "grad_norm": 0.9717479349571527, + "kl": 0.352783203125, + "learning_rate": 3.856557806255907e-07, + "loss": 0.0004, + "num_tokens": 28654702.0, + "reward": 0.50390625, + "reward_std": 0.11058359593153, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9140625, + "rewards/format_reward_func/std": 0.2813730239868164, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6826666666666666, + "grad_norm": 0.6531105470033437, + "kl": 0.4169921875, + "learning_rate": 3.844430457836064e-07, + "loss": 0.0004, + "num_tokens": 28804750.0, + "reward": 0.53125, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6862222222222222, + "grad_norm": 0.7581030751348999, + "kl": 0.4326171875, + "learning_rate": 3.8322584365434934e-07, + "loss": 0.0004, + "num_tokens": 28954890.0, + "reward": 0.53515625, + "reward_std": 0.08548745512962341, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.9140625, + "rewards/format_reward_func/std": 0.2813730239868164, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6897777777777778, + "grad_norm": 0.8123280487022461, + "kl": 0.378173828125, + "learning_rate": 3.8200421468313646e-07, + "loss": 0.0004, + "num_tokens": 29104990.0, + "reward": 0.515625, + "reward_std": 0.07767495512962341, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6933333333333334, + "grad_norm": 0.9557167275680298, + "kl": 0.458251953125, + "learning_rate": 3.807781994623802e-07, + "loss": 0.0005, + "num_tokens": 29255106.0, + "reward": 0.4765625, + "reward_std": 0.09616719186306, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.8984375, + "rewards/format_reward_func/std": 0.3032590448856354, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.6968888888888889, + "grad_norm": 20.914129417208414, + "kl": 2.05908203125, + "learning_rate": 3.7954783873023946e-07, + "loss": 0.0021, + "num_tokens": 29405238.0, + "reward": 0.51171875, + "reward_std": 0.08416798710823059, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9296875, + "rewards/format_reward_func/std": 0.2566775679588318, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7004444444444444, + "grad_norm": 0.8290581326894807, + "kl": 0.460693359375, + "learning_rate": 3.7831317336926674e-07, + "loss": 0.0005, + "num_tokens": 29555306.0, + "reward": 0.53125, + "reward_std": 0.0739382952451706, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.704, + "grad_norm": 0.7450940079039334, + "kl": 0.443115234375, + "learning_rate": 3.7707424440504863e-07, + "loss": 0.0004, + "num_tokens": 29705410.0, + "reward": 0.48046875, + "reward_std": 0.06370859593153, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.9375, + "rewards/format_reward_func/std": 0.24301259219646454, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7075555555555556, + "grad_norm": 0.7205252163706303, + "kl": 0.453125, + "learning_rate": 3.758310930048436e-07, + "loss": 0.0005, + "num_tokens": 29855442.0, + "reward": 0.53125, + "reward_std": 0.0625, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7111111111111111, + "grad_norm": 1.0214742583016907, + "kl": 0.45263671875, + "learning_rate": 3.7458376047621356e-07, + "loss": 0.0005, + "num_tokens": 30005478.0, + "reward": 0.55078125, + "reward_std": 0.12443909049034119, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.9296875, + "rewards/format_reward_func/std": 0.2566775679588318, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7146666666666667, + "grad_norm": 4.352162912273971, + "kl": 1.650634765625, + "learning_rate": 3.733322882656511e-07, + "loss": 0.0016, + "num_tokens": 30155550.0, + "reward": 0.515625, + "reward_std": 0.08427885174751282, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9296875, + "rewards/format_reward_func/std": 0.2566775679588318, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7182222222222222, + "grad_norm": 0.8642171883878098, + "kl": 0.646728515625, + "learning_rate": 3.7207671795720296e-07, + "loss": 0.0006, + "num_tokens": 30305614.0, + "reward": 0.53125, + "reward_std": 0.0625, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7217777777777777, + "grad_norm": 0.9770611026209789, + "kl": 0.48193359375, + "learning_rate": 3.7081709127108767e-07, + "loss": 0.0005, + "num_tokens": 30455722.0, + "reward": 0.50390625, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7253333333333334, + "grad_norm": 0.9670302361571421, + "kl": 0.5126953125, + "learning_rate": 3.695534500623096e-07, + "loss": 0.0005, + "num_tokens": 30605826.0, + "reward": 0.48828125, + "reward_std": 0.04808359593153, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7288888888888889, + "grad_norm": 3.23663467525944, + "kl": 0.73828125, + "learning_rate": 3.68285836319268e-07, + "loss": 0.0007, + "num_tokens": 30755898.0, + "reward": 0.49609375, + "reward_std": 0.04808359593153, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7324444444444445, + "grad_norm": 6.934307670841274, + "kl": 0.99365234375, + "learning_rate": 3.6701429216236204e-07, + "loss": 0.001, + "num_tokens": 30905906.0, + "reward": 0.5703125, + "reward_std": 0.08054219186306, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.736, + "grad_norm": 0.8933825229974401, + "kl": 0.4169921875, + "learning_rate": 3.657388598425908e-07, + "loss": 0.0004, + "num_tokens": 31055970.0, + "reward": 0.5390625, + "reward_std": 0.10892495512962341, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.921875, + "rewards/format_reward_func/std": 0.2694226801395416, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7395555555555555, + "grad_norm": 0.7876520940886738, + "kl": 0.554931640625, + "learning_rate": 3.644595817401501e-07, + "loss": 0.0006, + "num_tokens": 31206030.0, + "reward": 0.5390625, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7431111111111111, + "grad_norm": 0.30620401262295277, + "kl": 0.384521484375, + "learning_rate": 3.631765003630233e-07, + "loss": 0.0004, + "num_tokens": 31356098.0, + "reward": 0.51953125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7466666666666667, + "grad_norm": 0.565341568697795, + "kl": 0.37255859375, + "learning_rate": 3.6188965834556964e-07, + "loss": 0.0004, + "num_tokens": 31506174.0, + "reward": 0.54296875, + "reward_std": 0.041479695588350296, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7502222222222222, + "grad_norm": 0.7183038048403314, + "kl": 0.530029296875, + "learning_rate": 3.605990984471073e-07, + "loss": 0.0005, + "num_tokens": 31656230.0, + "reward": 0.53515625, + "reward_std": 0.05050079524517059, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7537777777777778, + "grad_norm": 0.9592998793501326, + "kl": 0.6806640625, + "learning_rate": 3.5930486355049254e-07, + "loss": 0.0007, + "num_tokens": 31806246.0, + "reward": 0.5625, + "reward_std": 0.09616719186306, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7573333333333333, + "grad_norm": 0.5967010195878546, + "kl": 0.388671875, + "learning_rate": 3.580069966606949e-07, + "loss": 0.0004, + "num_tokens": 31956354.0, + "reward": 0.52734375, + "reward_std": 0.045216359198093414, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7608888888888888, + "grad_norm": 0.5984534897529947, + "kl": 0.3974609375, + "learning_rate": 3.5670554090336804e-07, + "loss": 0.0004, + "num_tokens": 32106454.0, + "reward": 0.515625, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7644444444444445, + "grad_norm": 1.527700749087867, + "kl": 0.652587890625, + "learning_rate": 3.55400539523417e-07, + "loss": 0.0007, + "num_tokens": 32256526.0, + "reward": 0.53125, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.768, + "grad_norm": 0.7168370595424849, + "kl": 0.469970703125, + "learning_rate": 3.5409203588356096e-07, + "loss": 0.0005, + "num_tokens": 32406606.0, + "reward": 0.546875, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7715555555555556, + "grad_norm": 0.6937218726790086, + "kl": 0.48388671875, + "learning_rate": 3.527800734628927e-07, + "loss": 0.0005, + "num_tokens": 32556662.0, + "reward": 0.55859375, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7751111111111111, + "grad_norm": 0.5625106257141403, + "kl": 0.36279296875, + "learning_rate": 3.5146469585543386e-07, + "loss": 0.0004, + "num_tokens": 32706774.0, + "reward": 0.546875, + "reward_std": 0.05170939117670059, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7786666666666666, + "grad_norm": 1.8811641758087445, + "kl": 0.4970703125, + "learning_rate": 3.501459467686859e-07, + "loss": 0.0005, + "num_tokens": 32856802.0, + "reward": 0.5390625, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7822222222222223, + "grad_norm": 0.717878846553723, + "kl": 0.44091796875, + "learning_rate": 3.4882387002217837e-07, + "loss": 0.0004, + "num_tokens": 33006962.0, + "reward": 0.51171875, + "reward_std": 0.0703125, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7857777777777778, + "grad_norm": 4.888546951880972, + "kl": 1.05029296875, + "learning_rate": 3.474985095460127e-07, + "loss": 0.0011, + "num_tokens": 33157078.0, + "reward": 0.5625, + "reward_std": 0.07152109593153, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7893333333333333, + "grad_norm": 0.9668364386663721, + "kl": 0.631591796875, + "learning_rate": 3.4616990937940207e-07, + "loss": 0.0006, + "num_tokens": 33307122.0, + "reward": 0.515625, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7928888888888889, + "grad_norm": 0.5721228641007269, + "kl": 0.40625, + "learning_rate": 3.448381136692089e-07, + "loss": 0.0004, + "num_tokens": 33457150.0, + "reward": 0.53515625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.7964444444444444, + "grad_norm": 0.7641757061343398, + "kl": 0.369384765625, + "learning_rate": 3.435031666684771e-07, + "loss": 0.0004, + "num_tokens": 33607274.0, + "reward": 0.48828125, + "reward_std": 0.06986245512962341, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.9375, + "rewards/format_reward_func/std": 0.24301259219646454, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8, + "grad_norm": 0.676066907329912, + "kl": 0.421630859375, + "learning_rate": 3.421651127349622e-07, + "loss": 0.0004, + "num_tokens": 33757358.0, + "reward": 0.53125, + "reward_std": 0.04929219186306, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8035555555555556, + "grad_norm": 0.3458307366778419, + "kl": 0.41552734375, + "learning_rate": 3.4082399632965696e-07, + "loss": 0.0004, + "num_tokens": 33907474.0, + "reward": 0.51953125, + "reward_std": 0.016833597794175148, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8071111111111111, + "grad_norm": 28.831014334944417, + "kl": 3.250244140625, + "learning_rate": 3.394798620153147e-07, + "loss": 0.0033, + "num_tokens": 34057594.0, + "reward": 0.54296875, + "reward_std": 0.08835469186306, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8106666666666666, + "grad_norm": 0.6851911505727085, + "kl": 0.3564453125, + "learning_rate": 3.3813275445496766e-07, + "loss": 0.0004, + "num_tokens": 34207678.0, + "reward": 0.49609375, + "reward_std": 0.04808359593153, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8142222222222222, + "grad_norm": 4.524751208833203, + "kl": 0.817626953125, + "learning_rate": 3.367827184104437e-07, + "loss": 0.0008, + "num_tokens": 34357638.0, + "reward": 0.5625, + "reward_std": 0.05357225611805916, + "rewards/equation_reward_func/mean": 0.1796875, + "rewards/equation_reward_func/std": 0.3854354918003082, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8177777777777778, + "grad_norm": 0.7657245453002164, + "kl": 0.416748046875, + "learning_rate": 3.354297987408784e-07, + "loss": 0.0004, + "num_tokens": 34507658.0, + "reward": 0.5234375, + "reward_std": 0.078125, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.9375, + "rewards/format_reward_func/std": 0.24301259219646454, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8213333333333334, + "grad_norm": 0.6307999515422514, + "kl": 0.386962890625, + "learning_rate": 3.340740404012251e-07, + "loss": 0.0004, + "num_tokens": 34657738.0, + "reward": 0.56640625, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8248888888888889, + "grad_norm": 0.7615234790182357, + "kl": 0.5302734375, + "learning_rate": 3.3271548844076034e-07, + "loss": 0.0005, + "num_tokens": 34807830.0, + "reward": 0.5, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9375, + "rewards/format_reward_func/std": 0.24301259219646454, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8284444444444444, + "grad_norm": 45.22177711175558, + "kl": 7.133056640625, + "learning_rate": 3.313541880015877e-07, + "loss": 0.0071, + "num_tokens": 34957886.0, + "reward": 0.49609375, + "reward_std": 0.05710469186306, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.832, + "grad_norm": 0.4728352153390914, + "kl": 0.421142578125, + "learning_rate": 3.299901843171374e-07, + "loss": 0.0004, + "num_tokens": 35108018.0, + "reward": 0.51953125, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8355555555555556, + "grad_norm": 0.7556394805634985, + "kl": 0.410888671875, + "learning_rate": 3.2862352271066324e-07, + "loss": 0.0004, + "num_tokens": 35258154.0, + "reward": 0.51171875, + "reward_std": 0.06370859593153, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8391111111111111, + "grad_norm": 60.219344008615124, + "kl": 9.630859375, + "learning_rate": 3.272542485937368e-07, + "loss": 0.0097, + "num_tokens": 35408250.0, + "reward": 0.53515625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8426666666666667, + "grad_norm": 0.6258122056705749, + "kl": 0.3935546875, + "learning_rate": 3.2588240746473866e-07, + "loss": 0.0004, + "num_tokens": 35558338.0, + "reward": 0.546875, + "reward_std": 0.04400775954127312, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8462222222222222, + "grad_norm": 0.7556560670668717, + "kl": 0.403564453125, + "learning_rate": 3.245080449073459e-07, + "loss": 0.0004, + "num_tokens": 35708434.0, + "reward": 0.5078125, + "reward_std": 0.07152109593153, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8497777777777777, + "grad_norm": 0.6981235833583292, + "kl": 0.39892578125, + "learning_rate": 3.231312065890183e-07, + "loss": 0.0004, + "num_tokens": 35858494.0, + "reward": 0.51953125, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8533333333333334, + "grad_norm": 0.7057474719839335, + "kl": 0.416748046875, + "learning_rate": 3.217519382594801e-07, + "loss": 0.0004, + "num_tokens": 36008622.0, + "reward": 0.5390625, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8568888888888889, + "grad_norm": 0.6774457062153861, + "kl": 0.3857421875, + "learning_rate": 3.203702857492005e-07, + "loss": 0.0004, + "num_tokens": 36158658.0, + "reward": 0.56640625, + "reward_std": 0.04808359593153, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8604444444444445, + "grad_norm": 0.8579385643539741, + "kl": 0.399658203125, + "learning_rate": 3.189862949678704e-07, + "loss": 0.0004, + "num_tokens": 36308694.0, + "reward": 0.48046875, + "reward_std": 0.06370859593153, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.921875, + "rewards/format_reward_func/std": 0.2694226801395416, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.864, + "grad_norm": 0.8918776174630627, + "kl": 0.478515625, + "learning_rate": 3.1760001190287695e-07, + "loss": 0.0005, + "num_tokens": 36458806.0, + "reward": 0.50390625, + "reward_std": 0.0859375, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9296875, + "rewards/format_reward_func/std": 0.2566775679588318, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8675555555555555, + "grad_norm": 14.470212904907461, + "kl": 1.18115234375, + "learning_rate": 3.162114826177756e-07, + "loss": 0.0012, + "num_tokens": 36608858.0, + "reward": 0.5, + "reward_std": 0.049292195588350296, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8711111111111111, + "grad_norm": 0.8767487112614263, + "kl": 0.48486328125, + "learning_rate": 3.148207532507595e-07, + "loss": 0.0005, + "num_tokens": 36758898.0, + "reward": 0.5546875, + "reward_std": 0.09375, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8746666666666667, + "grad_norm": 0.7666837173815899, + "kl": 0.53466796875, + "learning_rate": 3.134278700131262e-07, + "loss": 0.0005, + "num_tokens": 36909062.0, + "reward": 0.51953125, + "reward_std": 0.0817507952451706, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.921875, + "rewards/format_reward_func/std": 0.2694226801395416, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8782222222222222, + "grad_norm": 0.4660959690863843, + "kl": 0.361572265625, + "learning_rate": 3.1203287918774224e-07, + "loss": 0.0004, + "num_tokens": 37059178.0, + "reward": 0.50390625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8817777777777778, + "grad_norm": 0.7433620210188535, + "kl": 0.40185546875, + "learning_rate": 3.106358271275056e-07, + "loss": 0.0004, + "num_tokens": 37209262.0, + "reward": 0.484375, + "reward_std": 0.06491719186306, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.9375, + "rewards/format_reward_func/std": 0.24301259219646454, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8853333333333333, + "grad_norm": 1.3225278900078785, + "kl": 0.643310546875, + "learning_rate": 3.0923676025380483e-07, + "loss": 0.0006, + "num_tokens": 37359390.0, + "reward": 0.52734375, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8888888888888888, + "grad_norm": 1.3565201947424719, + "kl": 0.63818359375, + "learning_rate": 3.078357250549772e-07, + "loss": 0.0006, + "num_tokens": 37509438.0, + "reward": 0.5625, + "reward_std": 0.08054219186306, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.9375, + "rewards/format_reward_func/std": 0.24301259219646454, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8924444444444445, + "grad_norm": 0.7985946426687209, + "kl": 0.533203125, + "learning_rate": 3.064327680847635e-07, + "loss": 0.0005, + "num_tokens": 37659486.0, + "reward": 0.51171875, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.896, + "grad_norm": 0.7239646058609255, + "kl": 0.391357421875, + "learning_rate": 3.0502793596076136e-07, + "loss": 0.0004, + "num_tokens": 37809642.0, + "reward": 0.546875, + "reward_std": 0.07393828779459, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.8995555555555556, + "grad_norm": 0.7771799102205033, + "kl": 0.45751953125, + "learning_rate": 3.0362127536287636e-07, + "loss": 0.0005, + "num_tokens": 37959778.0, + "reward": 0.48828125, + "reward_std": 0.06370859593153, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9031111111111111, + "grad_norm": 1.2830628968694793, + "kl": 0.66552734375, + "learning_rate": 3.022128330317705e-07, + "loss": 0.0007, + "num_tokens": 38109842.0, + "reward": 0.56640625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9066666666666666, + "grad_norm": 0.5669445746630175, + "kl": 0.4755859375, + "learning_rate": 3.0080265576730977e-07, + "loss": 0.0005, + "num_tokens": 38260030.0, + "reward": 0.48046875, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9102222222222223, + "grad_norm": 0.6356374575865582, + "kl": 0.4111328125, + "learning_rate": 2.993907904270084e-07, + "loss": 0.0004, + "num_tokens": 38410070.0, + "reward": 0.49609375, + "reward_std": 0.04808359593153, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.9375, + "rewards/format_reward_func/std": 0.24301259219646454, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9137777777777778, + "grad_norm": 5.301041165177461, + "kl": 1.978759765625, + "learning_rate": 2.979772839244723e-07, + "loss": 0.002, + "num_tokens": 38560230.0, + "reward": 0.53125, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9173333333333333, + "grad_norm": 0.6352697465433857, + "kl": 0.384033203125, + "learning_rate": 2.965621832278401e-07, + "loss": 0.0004, + "num_tokens": 38710354.0, + "reward": 0.52734375, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9208888888888889, + "grad_norm": 0.325841215358069, + "kl": 0.458740234375, + "learning_rate": 2.951455353582224e-07, + "loss": 0.0005, + "num_tokens": 38860410.0, + "reward": 0.53125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9244444444444444, + "grad_norm": 0.6229004603021032, + "kl": 0.416748046875, + "learning_rate": 2.937273873881396e-07, + "loss": 0.0004, + "num_tokens": 39010430.0, + "reward": 0.53125, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.928, + "grad_norm": 1.3942727444158145, + "kl": 0.628662109375, + "learning_rate": 2.9230778643995724e-07, + "loss": 0.0006, + "num_tokens": 39160466.0, + "reward": 0.515625, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9315555555555556, + "grad_norm": 0.736925497405656, + "kl": 0.414794921875, + "learning_rate": 2.90886779684321e-07, + "loss": 0.0004, + "num_tokens": 39310574.0, + "reward": 0.53515625, + "reward_std": 0.05710469186306, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.921875, + "rewards/format_reward_func/std": 0.2694226801395416, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9351111111111111, + "grad_norm": 1.3153915670345286, + "kl": 0.77880859375, + "learning_rate": 2.894644143385885e-07, + "loss": 0.0008, + "num_tokens": 39460734.0, + "reward": 0.51953125, + "reward_std": 0.0703125, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9386666666666666, + "grad_norm": 0.3666627577855282, + "kl": 0.4052734375, + "learning_rate": 2.8804073766526095e-07, + "loss": 0.0004, + "num_tokens": 39610814.0, + "reward": 0.54296875, + "reward_std": 0.020570259541273117, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9422222222222222, + "grad_norm": 2.316729417862906, + "kl": 1.9755859375, + "learning_rate": 2.866157969704125e-07, + "loss": 0.002, + "num_tokens": 39760914.0, + "reward": 0.53125, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9457777777777778, + "grad_norm": 0.5361641805790162, + "kl": 0.3984375, + "learning_rate": 2.851896396021181e-07, + "loss": 0.0004, + "num_tokens": 39911018.0, + "reward": 0.53515625, + "reward_std": 0.030584799125790596, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9493333333333334, + "grad_norm": 2.578024238948283, + "kl": 0.693115234375, + "learning_rate": 2.837623129488808e-07, + "loss": 0.0007, + "num_tokens": 40061098.0, + "reward": 0.51953125, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9528888888888889, + "grad_norm": 0.5179523249592263, + "kl": 0.380859375, + "learning_rate": 2.823338644380566e-07, + "loss": 0.0004, + "num_tokens": 40211290.0, + "reward": 0.50390625, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9564444444444444, + "grad_norm": 0.6716890754838255, + "kl": 0.4169921875, + "learning_rate": 2.809043415342784e-07, + "loss": 0.0004, + "num_tokens": 40361402.0, + "reward": 0.546875, + "reward_std": 0.0625, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.96, + "grad_norm": 1.8696840139920259, + "kl": 0.512451171875, + "learning_rate": 2.794737917378797e-07, + "loss": 0.0005, + "num_tokens": 40511494.0, + "reward": 0.515625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9635555555555556, + "grad_norm": 0.5775124536896724, + "kl": 0.4365234375, + "learning_rate": 2.780422625833153e-07, + "loss": 0.0004, + "num_tokens": 40661614.0, + "reward": 0.53125, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9671111111111111, + "grad_norm": 0.5174571918726184, + "kl": 0.40380859375, + "learning_rate": 2.766098016375823e-07, + "loss": 0.0004, + "num_tokens": 40811670.0, + "reward": 0.53125, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9706666666666667, + "grad_norm": 10.856193948863917, + "kl": 2.42578125, + "learning_rate": 2.751764564986396e-07, + "loss": 0.0024, + "num_tokens": 40961810.0, + "reward": 0.53125, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9742222222222222, + "grad_norm": 0.5252541651736331, + "kl": 0.414306640625, + "learning_rate": 2.737422747938259e-07, + "loss": 0.0004, + "num_tokens": 41111894.0, + "reward": 0.54296875, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9777777777777777, + "grad_norm": 0.584952826070172, + "kl": 0.364501953125, + "learning_rate": 2.723073041782776e-07, + "loss": 0.0004, + "num_tokens": 41262010.0, + "reward": 0.55078125, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9813333333333333, + "grad_norm": 0.6583135007881525, + "kl": 0.427001953125, + "learning_rate": 2.708715923333451e-07, + "loss": 0.0004, + "num_tokens": 41412086.0, + "reward": 0.56640625, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9848888888888889, + "grad_norm": 0.4967388969256047, + "kl": 0.44140625, + "learning_rate": 2.6943518696500835e-07, + "loss": 0.0004, + "num_tokens": 41562150.0, + "reward": 0.51953125, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9884444444444445, + "grad_norm": 0.5621455003149533, + "kl": 0.484619140625, + "learning_rate": 2.6799813580229174e-07, + "loss": 0.0005, + "num_tokens": 41712282.0, + "reward": 0.5625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.992, + "grad_norm": 0.4746005261751424, + "kl": 0.3916015625, + "learning_rate": 2.6656048659567834e-07, + "loss": 0.0004, + "num_tokens": 41862370.0, + "reward": 0.5546875, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9955555555555555, + "grad_norm": 0.535734063401937, + "kl": 0.441162109375, + "learning_rate": 2.65122287115523e-07, + "loss": 0.0004, + "num_tokens": 42012482.0, + "reward": 0.55078125, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.9991111111111111, + "grad_norm": 0.07429460937077817, + "kl": 0.3955078125, + "learning_rate": 2.63683585150465e-07, + "loss": 0.0004, + "num_tokens": 42162534.0, + "reward": 0.53125, + "reward_std": 0.0, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0035555555555555, + "grad_norm": 0.8470947965625311, + "kl": 0.468505859375, + "learning_rate": 2.622444285058404e-07, + "loss": 0.0005, + "num_tokens": 42312626.0, + "reward": 0.49609375, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.007111111111111, + "grad_norm": 0.7246290147592683, + "kl": 0.474853515625, + "learning_rate": 2.6080486500209347e-07, + "loss": 0.0005, + "num_tokens": 42462734.0, + "reward": 0.5234375, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0106666666666666, + "grad_norm": 0.7970433352687321, + "kl": 0.417724609375, + "learning_rate": 2.5936494247318733e-07, + "loss": 0.0004, + "num_tokens": 42612814.0, + "reward": 0.5, + "reward_std": 0.0625, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0142222222222221, + "grad_norm": 0.5963898764489264, + "kl": 0.490478515625, + "learning_rate": 2.5792470876501517e-07, + "loss": 0.0005, + "num_tokens": 42762926.0, + "reward": 0.4921875, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0078125, + "rewards/equation_reward_func/std": 0.0883883461356163, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 1023.1171875, + "completions/mean_terminated_length": 911.0, + "completions/min_length": 911.0, + "completions/min_terminated_length": 911.0, + "epoch": 1.0177777777777777, + "grad_norm": 0.7142379802123127, + "kl": 0.412841796875, + "learning_rate": 2.5648421173380974e-07, + "loss": -0.0009, + "num_tokens": 42912905.0, + "reward": 0.51171875, + "reward_std": 0.057104695588350296, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0213333333333334, + "grad_norm": 0.6099036684005931, + "kl": 0.409912109375, + "learning_rate": 2.550434992445538e-07, + "loss": 0.0004, + "num_tokens": 43062965.0, + "reward": 0.5390625, + "reward_std": 0.053028859198093414, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.024888888888889, + "grad_norm": 0.4520069380770881, + "kl": 0.437255859375, + "learning_rate": 2.536026191693893e-07, + "loss": 0.0004, + "num_tokens": 43213025.0, + "reward": 0.5390625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0284444444444445, + "grad_norm": 0.6319811742197129, + "kl": 0.369873046875, + "learning_rate": 2.521616193860266e-07, + "loss": 0.0004, + "num_tokens": 43363137.0, + "reward": 0.50390625, + "reward_std": 0.041479695588350296, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.032, + "grad_norm": 2.6591058310513374, + "kl": 0.842041015625, + "learning_rate": 2.507205477761539e-07, + "loss": 0.0008, + "num_tokens": 43513221.0, + "reward": 0.515625, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0355555555555556, + "grad_norm": 0.5420344680148037, + "kl": 0.37548828125, + "learning_rate": 2.4927945222384613e-07, + "loss": 0.0004, + "num_tokens": 43663345.0, + "reward": 0.52734375, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.039111111111111, + "grad_norm": 0.5473848935681827, + "kl": 0.51171875, + "learning_rate": 2.4783838061397334e-07, + "loss": 0.0005, + "num_tokens": 43813417.0, + "reward": 0.515625, + "reward_std": 0.0, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0426666666666666, + "grad_norm": 0.7008917634695417, + "kl": 0.634765625, + "learning_rate": 2.4639738083061073e-07, + "loss": 0.0006, + "num_tokens": 43963469.0, + "reward": 0.51171875, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0462222222222222, + "grad_norm": 0.3295873056459611, + "kl": 0.4091796875, + "learning_rate": 2.4495650075544613e-07, + "loss": 0.0004, + "num_tokens": 44113521.0, + "reward": 0.53125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0497777777777777, + "grad_norm": 0.5414055832707978, + "kl": 0.39501953125, + "learning_rate": 2.435157882661903e-07, + "loss": 0.0004, + "num_tokens": 44263553.0, + "reward": 0.54296875, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0533333333333332, + "grad_norm": 0.4682537399126061, + "kl": 0.397705078125, + "learning_rate": 2.420752912349848e-07, + "loss": 0.0004, + "num_tokens": 44413705.0, + "reward": 0.53515625, + "reward_std": 0.045216359198093414, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.056888888888889, + "grad_norm": 0.922022274288094, + "kl": 0.557861328125, + "learning_rate": 2.4063505752681265e-07, + "loss": 0.0006, + "num_tokens": 44563845.0, + "reward": 0.46875, + "reward_std": 0.06491719186306, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.921875, + "rewards/format_reward_func/std": 0.2694226801395416, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0604444444444445, + "grad_norm": 0.2921901386740173, + "kl": 0.396484375, + "learning_rate": 2.3919513499790646e-07, + "loss": 0.0004, + "num_tokens": 44713933.0, + "reward": 0.5625, + "reward_std": 0.018042195588350296, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.064, + "grad_norm": 0.45593720122261916, + "kl": 0.45068359375, + "learning_rate": 2.3775557149415953e-07, + "loss": 0.0005, + "num_tokens": 44863933.0, + "reward": 0.5625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0675555555555556, + "grad_norm": 0.37302671430025486, + "kl": 0.4716796875, + "learning_rate": 2.3631641484953493e-07, + "loss": 0.0005, + "num_tokens": 45014049.0, + "reward": 0.5390625, + "reward_std": 0.009021097794175148, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0711111111111111, + "grad_norm": 0.39290892679689327, + "kl": 0.373046875, + "learning_rate": 2.3487771288447703e-07, + "loss": 0.0004, + "num_tokens": 45164101.0, + "reward": 0.53515625, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0746666666666667, + "grad_norm": 0.7056356953892552, + "kl": 0.4267578125, + "learning_rate": 2.3343951340432158e-07, + "loss": 0.0004, + "num_tokens": 45314165.0, + "reward": 0.51953125, + "reward_std": 0.06744526326656342, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0782222222222222, + "grad_norm": 0.5027562199597438, + "kl": 0.489501953125, + "learning_rate": 2.3200186419770823e-07, + "loss": 0.0005, + "num_tokens": 45464281.0, + "reward": 0.51953125, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0817777777777777, + "grad_norm": 0.2975548497811407, + "kl": 0.385498046875, + "learning_rate": 2.3056481303499163e-07, + "loss": 0.0004, + "num_tokens": 45614357.0, + "reward": 0.5390625, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0853333333333333, + "grad_norm": 0.43699072382589177, + "kl": 0.414306640625, + "learning_rate": 2.291284076666549e-07, + "loss": 0.0004, + "num_tokens": 45764381.0, + "reward": 0.5390625, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0888888888888888, + "grad_norm": 0.7264220978249349, + "kl": 0.537109375, + "learning_rate": 2.2769269582172236e-07, + "loss": 0.0005, + "num_tokens": 45914405.0, + "reward": 0.52734375, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0924444444444443, + "grad_norm": 0.8246989232799073, + "kl": 0.415771484375, + "learning_rate": 2.262577252061741e-07, + "loss": 0.0004, + "num_tokens": 46064497.0, + "reward": 0.56640625, + "reward_std": 0.07933359593153, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.096, + "grad_norm": 1.5117707972901218, + "kl": 0.685791015625, + "learning_rate": 2.2482354350136043e-07, + "loss": 0.0007, + "num_tokens": 46214529.0, + "reward": 0.55078125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.0995555555555556, + "grad_norm": 0.4384748657076267, + "kl": 0.436279296875, + "learning_rate": 2.2339019836241768e-07, + "loss": 0.0004, + "num_tokens": 46364633.0, + "reward": 0.51953125, + "reward_std": 0.016833597794175148, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1031111111111112, + "grad_norm": 0.5367303888744025, + "kl": 0.478515625, + "learning_rate": 2.219577374166847e-07, + "loss": 0.0005, + "num_tokens": 46514697.0, + "reward": 0.5078125, + "reward_std": 0.03839729726314545, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1066666666666667, + "grad_norm": 0.5858380888629433, + "kl": 0.43408203125, + "learning_rate": 2.2052620826212031e-07, + "loss": 0.0004, + "num_tokens": 46664821.0, + "reward": 0.51953125, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1102222222222222, + "grad_norm": 0.641916377352643, + "kl": 0.55615234375, + "learning_rate": 2.1909565846572158e-07, + "loss": 0.0006, + "num_tokens": 46814953.0, + "reward": 0.48828125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1137777777777778, + "grad_norm": 0.33090250971908225, + "kl": 0.41796875, + "learning_rate": 2.1766613556194344e-07, + "loss": 0.0004, + "num_tokens": 46964997.0, + "reward": 0.578125, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.1640625, + "rewards/equation_reward_func/std": 0.371787428855896, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1173333333333333, + "grad_norm": 15.61725314528695, + "kl": 3.2099609375, + "learning_rate": 2.1623768705111914e-07, + "loss": 0.0032, + "num_tokens": 47115081.0, + "reward": 0.51171875, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1208888888888888, + "grad_norm": 0.7107097215021569, + "kl": 0.523681640625, + "learning_rate": 2.1481036039788185e-07, + "loss": 0.0005, + "num_tokens": 47265145.0, + "reward": 0.57421875, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.1796875, + "rewards/equation_reward_func/std": 0.3854354918003082, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1244444444444444, + "grad_norm": 0.5636256684100517, + "kl": 0.508544921875, + "learning_rate": 2.133842030295875e-07, + "loss": 0.0005, + "num_tokens": 47415337.0, + "reward": 0.51171875, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1280000000000001, + "grad_norm": 0.5116667035209906, + "kl": 0.435791015625, + "learning_rate": 2.1195926233473905e-07, + "loss": 0.0004, + "num_tokens": 47565405.0, + "reward": 0.55859375, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1315555555555556, + "grad_norm": 0.6779613408018799, + "kl": 0.4248046875, + "learning_rate": 2.105355856614115e-07, + "loss": 0.0004, + "num_tokens": 47715461.0, + "reward": 0.5546875, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1351111111111112, + "grad_norm": 0.3921549457589845, + "kl": 0.412353515625, + "learning_rate": 2.0911322031567907e-07, + "loss": 0.0004, + "num_tokens": 47865549.0, + "reward": 0.53125, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1386666666666667, + "grad_norm": 0.4958321729986366, + "kl": 0.4521484375, + "learning_rate": 2.076922135600427e-07, + "loss": 0.0005, + "num_tokens": 48015629.0, + "reward": 0.5390625, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1422222222222222, + "grad_norm": 0.7423549988863264, + "kl": 0.406005859375, + "learning_rate": 2.0627261261186048e-07, + "loss": 0.0004, + "num_tokens": 48165705.0, + "reward": 0.55078125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1457777777777778, + "grad_norm": 0.22087517892930575, + "kl": 0.41162109375, + "learning_rate": 2.0485446464177752e-07, + "loss": 0.0004, + "num_tokens": 48315833.0, + "reward": 0.54296875, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1493333333333333, + "grad_norm": 0.9688657800607915, + "kl": 0.743408203125, + "learning_rate": 2.034378167721599e-07, + "loss": 0.0007, + "num_tokens": 48465977.0, + "reward": 0.5234375, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1528888888888889, + "grad_norm": 0.5069073201298848, + "kl": 0.3955078125, + "learning_rate": 2.0202271607552766e-07, + "loss": 0.0004, + "num_tokens": 48616049.0, + "reward": 0.51953125, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1564444444444444, + "grad_norm": 0.30676846934679053, + "kl": 0.37060546875, + "learning_rate": 2.006092095729916e-07, + "loss": 0.0004, + "num_tokens": 48766117.0, + "reward": 0.52734375, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.16, + "grad_norm": 8.64784262784224, + "kl": 1.320556640625, + "learning_rate": 1.9919734423269018e-07, + "loss": 0.0013, + "num_tokens": 48916213.0, + "reward": 0.53125, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1635555555555555, + "grad_norm": 0.5177358301467603, + "kl": 0.4267578125, + "learning_rate": 1.9778716696822948e-07, + "loss": 0.0004, + "num_tokens": 49066345.0, + "reward": 0.546875, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1671111111111112, + "grad_norm": 0.29160214419010005, + "kl": 0.4140625, + "learning_rate": 1.9637872463712362e-07, + "loss": 0.0004, + "num_tokens": 49216417.0, + "reward": 0.53125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1706666666666667, + "grad_norm": 0.37306669591193975, + "kl": 0.389404296875, + "learning_rate": 1.9497206403923864e-07, + "loss": 0.0004, + "num_tokens": 49366501.0, + "reward": 0.546875, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1742222222222223, + "grad_norm": 0.3457412118045277, + "kl": 0.439697265625, + "learning_rate": 1.9356723191523646e-07, + "loss": 0.0004, + "num_tokens": 49516501.0, + "reward": 0.515625, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1777777777777778, + "grad_norm": 0.49964396837602626, + "kl": 0.417236328125, + "learning_rate": 1.921642749450228e-07, + "loss": 0.0004, + "num_tokens": 49666537.0, + "reward": 0.609375, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.2265625, + "rewards/equation_reward_func/std": 0.4202519655227661, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1813333333333333, + "grad_norm": 0.4051353842023595, + "kl": 0.473388671875, + "learning_rate": 1.9076323974619512e-07, + "loss": 0.0005, + "num_tokens": 49816629.0, + "reward": 0.5703125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1848888888888889, + "grad_norm": 5.378834991563153, + "kl": 1.024658203125, + "learning_rate": 1.8936417287249446e-07, + "loss": 0.001, + "num_tokens": 49966761.0, + "reward": 0.5390625, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1884444444444444, + "grad_norm": 0.6720029560481326, + "kl": 0.527099609375, + "learning_rate": 1.8796712081225774e-07, + "loss": 0.0005, + "num_tokens": 50116905.0, + "reward": 0.55859375, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 1022.515625, + "completions/mean_terminated_length": 834.0, + "completions/min_length": 834.0, + "completions/min_terminated_length": 834.0, + "epoch": 1.192, + "grad_norm": 0.6578900739907899, + "kl": 0.492431640625, + "learning_rate": 1.8657212998687388e-07, + "loss": 0.0005, + "num_tokens": 50266775.0, + "reward": 0.56640625, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.1955555555555555, + "grad_norm": 0.5860458852325504, + "kl": 0.45849609375, + "learning_rate": 1.8517924674924046e-07, + "loss": 0.0005, + "num_tokens": 50416907.0, + "reward": 0.515625, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.199111111111111, + "grad_norm": 0.5764488924096699, + "kl": 0.3935546875, + "learning_rate": 1.8378851738222439e-07, + "loss": 0.0004, + "num_tokens": 50566947.0, + "reward": 0.5625, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2026666666666666, + "grad_norm": 0.3101655260933977, + "kl": 0.466552734375, + "learning_rate": 1.82399988097123e-07, + "loss": 0.0005, + "num_tokens": 50717059.0, + "reward": 0.54296875, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2062222222222223, + "grad_norm": 0.25806921540657135, + "kl": 0.427001953125, + "learning_rate": 1.8101370503212962e-07, + "loss": 0.0004, + "num_tokens": 50867127.0, + "reward": 0.54296875, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2097777777777778, + "grad_norm": 0.38683952780832426, + "kl": 0.398193359375, + "learning_rate": 1.7962971425079946e-07, + "loss": 0.0004, + "num_tokens": 51017191.0, + "reward": 0.55859375, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2133333333333334, + "grad_norm": 0.43762529890937657, + "kl": 0.40673828125, + "learning_rate": 1.7824806174051994e-07, + "loss": 0.0004, + "num_tokens": 51167303.0, + "reward": 0.55078125, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.216888888888889, + "grad_norm": 0.4947585618325522, + "kl": 0.500732421875, + "learning_rate": 1.7686879341098172e-07, + "loss": 0.0005, + "num_tokens": 51317335.0, + "reward": 0.5546875, + "reward_std": 0.033667195588350296, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2204444444444444, + "grad_norm": 0.29648752247386784, + "kl": 0.454345703125, + "learning_rate": 1.7549195509265407e-07, + "loss": 0.0005, + "num_tokens": 51467455.0, + "reward": 0.52734375, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.224, + "grad_norm": 0.3522267627578455, + "kl": 0.41064453125, + "learning_rate": 1.7411759253526137e-07, + "loss": 0.0004, + "num_tokens": 51617551.0, + "reward": 0.5546875, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2275555555555555, + "grad_norm": 3.809079002025788, + "kl": 1.059814453125, + "learning_rate": 1.7274575140626315e-07, + "loss": 0.0011, + "num_tokens": 51767667.0, + "reward": 0.55078125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.231111111111111, + "grad_norm": 0.20335882404499606, + "kl": 0.433837890625, + "learning_rate": 1.713764772893368e-07, + "loss": 0.0004, + "num_tokens": 51917775.0, + "reward": 0.55078125, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2346666666666666, + "grad_norm": 0.40351519211548675, + "kl": 0.44384765625, + "learning_rate": 1.7000981568286263e-07, + "loss": 0.0004, + "num_tokens": 52067895.0, + "reward": 0.546875, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2382222222222223, + "grad_norm": 0.5942030016843242, + "kl": 0.45703125, + "learning_rate": 1.6864581199841226e-07, + "loss": 0.0005, + "num_tokens": 52218003.0, + "reward": 0.51171875, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2417777777777779, + "grad_norm": 0.529993570144883, + "kl": 0.404052734375, + "learning_rate": 1.6728451155923966e-07, + "loss": 0.0004, + "num_tokens": 52368071.0, + "reward": 0.51953125, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2453333333333334, + "grad_norm": 0.8041631283434129, + "kl": 0.8837890625, + "learning_rate": 1.6592595959877493e-07, + "loss": 0.0009, + "num_tokens": 52518159.0, + "reward": 0.546875, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.248888888888889, + "grad_norm": 1.6872516484862146, + "kl": 0.507080078125, + "learning_rate": 1.6457020125912158e-07, + "loss": 0.0005, + "num_tokens": 52668287.0, + "reward": 0.51171875, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2524444444444445, + "grad_norm": 0.6429164573265299, + "kl": 0.455810546875, + "learning_rate": 1.6321728158955633e-07, + "loss": 0.0005, + "num_tokens": 52818387.0, + "reward": 0.5703125, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.256, + "grad_norm": 0.6468681508234729, + "kl": 0.43701171875, + "learning_rate": 1.6186724554503237e-07, + "loss": 0.0004, + "num_tokens": 52968427.0, + "reward": 0.51171875, + "reward_std": 0.055230896919965744, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2595555555555555, + "grad_norm": 0.44088485589071047, + "kl": 0.397216796875, + "learning_rate": 1.6052013798468528e-07, + "loss": 0.0004, + "num_tokens": 53118535.0, + "reward": 0.51171875, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.263111111111111, + "grad_norm": 0.3945491441045777, + "kl": 0.38134765625, + "learning_rate": 1.5917600367034302e-07, + "loss": 0.0004, + "num_tokens": 53268763.0, + "reward": 0.4921875, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2666666666666666, + "grad_norm": 0.638915571522292, + "kl": 0.408447265625, + "learning_rate": 1.578348872650378e-07, + "loss": 0.0004, + "num_tokens": 53418919.0, + "reward": 0.48828125, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2702222222222221, + "grad_norm": 0.5121027616204278, + "kl": 0.429931640625, + "learning_rate": 1.564968333315229e-07, + "loss": 0.0004, + "num_tokens": 53568999.0, + "reward": 0.55859375, + "reward_std": 0.04808359593153, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2737777777777777, + "grad_norm": 0.5563613028742419, + "kl": 0.509521484375, + "learning_rate": 1.5516188633079107e-07, + "loss": 0.0005, + "num_tokens": 53719115.0, + "reward": 0.51953125, + "reward_std": 0.04147969186306, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2773333333333334, + "grad_norm": 0.5962361306068441, + "kl": 0.387939453125, + "learning_rate": 1.5383009062059794e-07, + "loss": 0.0004, + "num_tokens": 53869339.0, + "reward": 0.52734375, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.280888888888889, + "grad_norm": 0.8520264003008496, + "kl": 0.6318359375, + "learning_rate": 1.525014904539873e-07, + "loss": 0.0006, + "num_tokens": 54019455.0, + "reward": 0.53125, + "reward_std": 0.0, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2844444444444445, + "grad_norm": 0.4853809422457624, + "kl": 0.45458984375, + "learning_rate": 1.5117612997782158e-07, + "loss": 0.0005, + "num_tokens": 54169571.0, + "reward": 0.52734375, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.288, + "grad_norm": 0.3747330328198201, + "kl": 0.42333984375, + "learning_rate": 1.49854053231314e-07, + "loss": 0.0004, + "num_tokens": 54319603.0, + "reward": 0.546875, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2915555555555556, + "grad_norm": 0.5492601505442783, + "kl": 0.47265625, + "learning_rate": 1.4853530414456612e-07, + "loss": 0.0005, + "num_tokens": 54469607.0, + "reward": 0.52734375, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.295111111111111, + "grad_norm": 0.33780973836896694, + "kl": 0.455810546875, + "learning_rate": 1.4721992653710718e-07, + "loss": 0.0005, + "num_tokens": 54619703.0, + "reward": 0.52734375, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.2986666666666666, + "grad_norm": 0.5018740934387684, + "kl": 0.458251953125, + "learning_rate": 1.45907964116439e-07, + "loss": 0.0005, + "num_tokens": 54769755.0, + "reward": 0.5703125, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3022222222222222, + "grad_norm": 0.48892083551158816, + "kl": 0.4052734375, + "learning_rate": 1.4459946047658305e-07, + "loss": 0.0004, + "num_tokens": 54919835.0, + "reward": 0.5234375, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3057777777777777, + "grad_norm": 0.35313257655187524, + "kl": 0.4345703125, + "learning_rate": 1.4329445909663194e-07, + "loss": 0.0004, + "num_tokens": 55069915.0, + "reward": 0.53125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3093333333333335, + "grad_norm": 1.3640081478037656, + "kl": 0.619384765625, + "learning_rate": 1.4199300333930515e-07, + "loss": 0.0006, + "num_tokens": 55219987.0, + "reward": 0.49609375, + "reward_std": 0.04620979726314545, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3128888888888888, + "grad_norm": 0.39606833878919273, + "kl": 0.41650390625, + "learning_rate": 1.4069513644950744e-07, + "loss": 0.0004, + "num_tokens": 55370151.0, + "reward": 0.4921875, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 1021.8046875, + "completions/mean_terminated_length": 743.0, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "epoch": 1.3164444444444445, + "grad_norm": 0.44470406838575705, + "kl": 0.40087890625, + "learning_rate": 1.394009015528927e-07, + "loss": 0.0004, + "num_tokens": 55520026.0, + "reward": 0.52734375, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.32, + "grad_norm": 0.3552220431069685, + "kl": 0.37841796875, + "learning_rate": 1.3811034165443036e-07, + "loss": 0.0004, + "num_tokens": 55670030.0, + "reward": 0.53515625, + "reward_std": 0.025854695588350296, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3235555555555556, + "grad_norm": 0.44156036308113156, + "kl": 0.417724609375, + "learning_rate": 1.3682349963697676e-07, + "loss": 0.0004, + "num_tokens": 55820110.0, + "reward": 0.48828125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3271111111111111, + "grad_norm": 0.8049133934652881, + "kl": 0.50244140625, + "learning_rate": 1.3554041825985e-07, + "loss": 0.0005, + "num_tokens": 55970202.0, + "reward": 0.5, + "reward_std": 0.09616719186306, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3306666666666667, + "grad_norm": 0.4029732452394675, + "kl": 0.392333984375, + "learning_rate": 1.3426114015740915e-07, + "loss": 0.0004, + "num_tokens": 56120286.0, + "reward": 0.5703125, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3342222222222222, + "grad_norm": 0.6289438946544794, + "kl": 0.447021484375, + "learning_rate": 1.3298570783763805e-07, + "loss": 0.0004, + "num_tokens": 56270414.0, + "reward": 0.57421875, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3377777777777777, + "grad_norm": 0.5979204566593044, + "kl": 0.445556640625, + "learning_rate": 1.31714163680732e-07, + "loss": 0.0004, + "num_tokens": 56420570.0, + "reward": 0.51171875, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3413333333333333, + "grad_norm": 0.5602574601449087, + "kl": 0.440185546875, + "learning_rate": 1.3044654993769044e-07, + "loss": 0.0004, + "num_tokens": 56570618.0, + "reward": 0.5859375, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3448888888888888, + "grad_norm": 0.584595173061934, + "kl": 0.387451171875, + "learning_rate": 1.2918290872891236e-07, + "loss": 0.0004, + "num_tokens": 56720726.0, + "reward": 0.55078125, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3484444444444446, + "grad_norm": 0.38195153284699324, + "kl": 0.42041015625, + "learning_rate": 1.2792328204279712e-07, + "loss": 0.0004, + "num_tokens": 56870738.0, + "reward": 0.5234375, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3519999999999999, + "grad_norm": 0.534164470884044, + "kl": 0.380126953125, + "learning_rate": 1.2666771173434892e-07, + "loss": 0.0004, + "num_tokens": 57020866.0, + "reward": 0.484375, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3555555555555556, + "grad_norm": 0.31327674285986346, + "kl": 0.404296875, + "learning_rate": 1.2541623952378655e-07, + "loss": 0.0004, + "num_tokens": 57170962.0, + "reward": 0.5, + "reward_std": 0.022772299125790596, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3591111111111112, + "grad_norm": 0.6333355146035587, + "kl": 0.42822265625, + "learning_rate": 1.2416890699515636e-07, + "loss": 0.0004, + "num_tokens": 57320938.0, + "reward": 0.5703125, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.1640625, + "rewards/equation_reward_func/std": 0.371787428855896, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3626666666666667, + "grad_norm": 0.8455919752181283, + "kl": 0.582275390625, + "learning_rate": 1.2292575559495143e-07, + "loss": 0.0006, + "num_tokens": 57470974.0, + "reward": 0.5, + "reward_std": 0.037403859198093414, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3662222222222222, + "grad_norm": 0.5563433958155662, + "kl": 0.4375, + "learning_rate": 1.216868266307333e-07, + "loss": 0.0004, + "num_tokens": 57621114.0, + "reward": 0.5234375, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3697777777777778, + "grad_norm": 0.42654455439415234, + "kl": 0.437744140625, + "learning_rate": 1.2045216126976054e-07, + "loss": 0.0004, + "num_tokens": 57771166.0, + "reward": 0.51953125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3733333333333333, + "grad_norm": 0.574456938538095, + "kl": 0.464111328125, + "learning_rate": 1.1922180053761985e-07, + "loss": 0.0005, + "num_tokens": 57921230.0, + "reward": 0.5390625, + "reward_std": 0.049292195588350296, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3768888888888888, + "grad_norm": 0.7083015972044223, + "kl": 0.5546875, + "learning_rate": 1.1799578531686355e-07, + "loss": 0.0006, + "num_tokens": 58071282.0, + "reward": 0.5390625, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3804444444444444, + "grad_norm": 0.8083602759346432, + "kl": 0.543212890625, + "learning_rate": 1.1677415634565066e-07, + "loss": 0.0005, + "num_tokens": 58221330.0, + "reward": 0.5625, + "reward_std": 0.07206448912620544, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.384, + "grad_norm": 0.8332269118489771, + "kl": 0.646240234375, + "learning_rate": 1.1555695421639369e-07, + "loss": 0.0006, + "num_tokens": 58371338.0, + "reward": 0.515625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3875555555555557, + "grad_norm": 0.6373859717105615, + "kl": 0.530029296875, + "learning_rate": 1.1434421937440927e-07, + "loss": 0.0005, + "num_tokens": 58521394.0, + "reward": 0.51953125, + "reward_std": 0.04808359593153, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3911111111111112, + "grad_norm": 0.5222272117027472, + "kl": 0.438232421875, + "learning_rate": 1.1313599211657493e-07, + "loss": 0.0004, + "num_tokens": 58671582.0, + "reward": 0.50390625, + "reward_std": 0.03619525954127312, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3946666666666667, + "grad_norm": 0.44409467166709055, + "kl": 0.444091796875, + "learning_rate": 1.1193231258998933e-07, + "loss": 0.0004, + "num_tokens": 58821646.0, + "reward": 0.56640625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.3982222222222223, + "grad_norm": 0.5442250998097384, + "kl": 0.53125, + "learning_rate": 1.1073322079063913e-07, + "loss": 0.0005, + "num_tokens": 58971662.0, + "reward": 0.53515625, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4017777777777778, + "grad_norm": 1.553408988777784, + "kl": 0.532958984375, + "learning_rate": 1.0953875656206896e-07, + "loss": 0.0005, + "num_tokens": 59121722.0, + "reward": 0.52734375, + "reward_std": 0.05710469186306, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4053333333333333, + "grad_norm": 0.5740208836758302, + "kl": 0.438232421875, + "learning_rate": 1.083489595940586e-07, + "loss": 0.0004, + "num_tokens": 59271834.0, + "reward": 0.52734375, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4088888888888889, + "grad_norm": 0.4151009063244617, + "kl": 0.42919921875, + "learning_rate": 1.0716386942130312e-07, + "loss": 0.0004, + "num_tokens": 59421910.0, + "reward": 0.5234375, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4124444444444444, + "grad_norm": 0.7311720397309218, + "kl": 0.43115234375, + "learning_rate": 1.0598352542210021e-07, + "loss": 0.0004, + "num_tokens": 59571914.0, + "reward": 0.53515625, + "reward_std": 0.05710469186306, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.416, + "grad_norm": 0.6992515497367471, + "kl": 0.637939453125, + "learning_rate": 1.0480796681704077e-07, + "loss": 0.0006, + "num_tokens": 59722026.0, + "reward": 0.53125, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4195555555555557, + "grad_norm": 0.5217670799751064, + "kl": 0.43408203125, + "learning_rate": 1.0363723266770649e-07, + "loss": 0.0004, + "num_tokens": 59872062.0, + "reward": 0.51171875, + "reward_std": 0.041479695588350296, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.423111111111111, + "grad_norm": 6.850730801381474, + "kl": 1.605712890625, + "learning_rate": 1.0247136187537123e-07, + "loss": 0.0016, + "num_tokens": 60022170.0, + "reward": 0.5078125, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4266666666666667, + "grad_norm": 0.5426555800683942, + "kl": 0.46826171875, + "learning_rate": 1.0131039317970907e-07, + "loss": 0.0005, + "num_tokens": 60172198.0, + "reward": 0.58203125, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.1796875, + "rewards/equation_reward_func/std": 0.3854354918003082, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4302222222222223, + "grad_norm": 0.4359928548595561, + "kl": 0.419921875, + "learning_rate": 1.0015436515750636e-07, + "loss": 0.0004, + "num_tokens": 60322274.0, + "reward": 0.5703125, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4337777777777778, + "grad_norm": 0.5019244396354806, + "kl": 0.3994140625, + "learning_rate": 9.900331622138063e-08, + "loss": 0.0004, + "num_tokens": 60472338.0, + "reward": 0.57421875, + "reward_std": 0.041479695588350296, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4373333333333334, + "grad_norm": 0.6518642054974103, + "kl": 0.466552734375, + "learning_rate": 9.785728461850346e-08, + "loss": 0.0005, + "num_tokens": 60622438.0, + "reward": 0.5390625, + "reward_std": 0.049292195588350296, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4408888888888889, + "grad_norm": 0.6183842691534489, + "kl": 0.401123046875, + "learning_rate": 9.671630842933027e-08, + "loss": 0.0004, + "num_tokens": 60772530.0, + "reward": 0.60546875, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.2421875, + "rewards/equation_reward_func/std": 0.4300905168056488, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4444444444444444, + "grad_norm": 0.5614152227928939, + "kl": 0.42724609375, + "learning_rate": 9.558042556633439e-08, + "loss": 0.0004, + "num_tokens": 60922622.0, + "reward": 0.515625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.448, + "grad_norm": 0.7515661137862419, + "kl": 0.38232421875, + "learning_rate": 9.44496737727479e-08, + "loss": 0.0004, + "num_tokens": 61072702.0, + "reward": 0.5390625, + "reward_std": 0.07525776326656342, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4515555555555555, + "grad_norm": 0.6888661898947002, + "kl": 0.446044921875, + "learning_rate": 9.332409062130686e-08, + "loss": 0.0004, + "num_tokens": 61222818.0, + "reward": 0.5546875, + "reward_std": 0.06491719186306, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.455111111111111, + "grad_norm": 5.768513710763025, + "kl": 1.973388671875, + "learning_rate": 9.220371351300352e-08, + "loss": 0.002, + "num_tokens": 61372914.0, + "reward": 0.59375, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.203125, + "rewards/equation_reward_func/std": 0.40390563011169434, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9921875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 1022.25, + "completions/mean_terminated_length": 800.0, + "completions/min_length": 800.0, + "completions/min_terminated_length": 800.0, + "epoch": 1.4586666666666668, + "grad_norm": 0.39703873761876407, + "kl": 0.51416015625, + "learning_rate": 9.10885796758428e-08, + "loss": 0.0005, + "num_tokens": 61522750.0, + "reward": 0.5078125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.462222222222222, + "grad_norm": 0.4105299696973317, + "kl": 0.400634765625, + "learning_rate": 8.997872616360603e-08, + "loss": 0.0004, + "num_tokens": 61672774.0, + "reward": 0.5078125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4657777777777778, + "grad_norm": 0.814786116863996, + "kl": 0.501953125, + "learning_rate": 8.887418985461903e-08, + "loss": 0.0005, + "num_tokens": 61822806.0, + "reward": 0.484375, + "reward_std": 0.0625, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4693333333333334, + "grad_norm": 0.6543891528432907, + "kl": 0.448974609375, + "learning_rate": 8.777500745052743e-08, + "loss": 0.0004, + "num_tokens": 61972982.0, + "reward": 0.484375, + "reward_std": 0.049292195588350296, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.472888888888889, + "grad_norm": 0.3103412060077848, + "kl": 0.407958984375, + "learning_rate": 8.668121547507634e-08, + "loss": 0.0004, + "num_tokens": 62123038.0, + "reward": 0.53125, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4764444444444444, + "grad_norm": 0.5962802113040242, + "kl": 0.478515625, + "learning_rate": 8.559285027289753e-08, + "loss": 0.0005, + "num_tokens": 62273026.0, + "reward": 0.59375, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.2109375, + "rewards/equation_reward_func/std": 0.4095771610736847, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.48, + "grad_norm": 0.6264823134539922, + "kl": 0.448486328125, + "learning_rate": 8.450994800830111e-08, + "loss": 0.0004, + "num_tokens": 62423086.0, + "reward": 0.53125, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4835555555555555, + "grad_norm": 0.8450656426538271, + "kl": 0.41796875, + "learning_rate": 8.343254466407435e-08, + "loss": 0.0004, + "num_tokens": 62573242.0, + "reward": 0.5234375, + "reward_std": 0.08427885919809341, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.487111111111111, + "grad_norm": 0.4851276707023854, + "kl": 0.388427734375, + "learning_rate": 8.236067604028562e-08, + "loss": 0.0004, + "num_tokens": 62723310.0, + "reward": 0.515625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4906666666666666, + "grad_norm": 0.529824828272533, + "kl": 0.39306640625, + "learning_rate": 8.129437775309533e-08, + "loss": 0.0004, + "num_tokens": 62873314.0, + "reward": 0.53125, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4942222222222221, + "grad_norm": 0.7362472960586969, + "kl": 0.423583984375, + "learning_rate": 8.023368523357182e-08, + "loss": 0.0004, + "num_tokens": 63023462.0, + "reward": 0.51953125, + "reward_std": 0.06370859593153, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.4977777777777779, + "grad_norm": 0.7821251913385924, + "kl": 0.479736328125, + "learning_rate": 7.917863372651476e-08, + "loss": 0.0005, + "num_tokens": 63173582.0, + "reward": 0.51171875, + "reward_std": 0.0817507952451706, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9375, + "rewards/format_reward_func/std": 0.24301259219646454, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5013333333333332, + "grad_norm": 0.29499660137723543, + "kl": 0.4189453125, + "learning_rate": 7.812925828928332e-08, + "loss": 0.0004, + "num_tokens": 63323694.0, + "reward": 0.52734375, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.504888888888889, + "grad_norm": 3.151628702237811, + "kl": 1.814453125, + "learning_rate": 7.708559379063204e-08, + "loss": 0.0018, + "num_tokens": 63473758.0, + "reward": 0.5390625, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5084444444444445, + "grad_norm": 0.42289374767757515, + "kl": 0.399658203125, + "learning_rate": 7.604767490955138e-08, + "loss": 0.0004, + "num_tokens": 63623866.0, + "reward": 0.53515625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.512, + "grad_norm": 0.810138453876871, + "kl": 0.49365234375, + "learning_rate": 7.501553613411626e-08, + "loss": 0.0005, + "num_tokens": 63774038.0, + "reward": 0.5234375, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5155555555555555, + "grad_norm": 0.5491563744374505, + "kl": 0.431396484375, + "learning_rate": 7.398921176033928e-08, + "loss": 0.0004, + "num_tokens": 63924126.0, + "reward": 0.50390625, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.519111111111111, + "grad_norm": 0.7576014460271948, + "kl": 0.43359375, + "learning_rate": 7.296873589103184e-08, + "loss": 0.0004, + "num_tokens": 64074174.0, + "reward": 0.546875, + "reward_std": 0.06491719186306, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5226666666666666, + "grad_norm": 0.5650904936953062, + "kl": 0.400634765625, + "learning_rate": 7.195414243467029e-08, + "loss": 0.0004, + "num_tokens": 64224258.0, + "reward": 0.51171875, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5262222222222221, + "grad_norm": 1.7191187286350949, + "kl": 0.529541015625, + "learning_rate": 7.094546510426994e-08, + "loss": 0.0005, + "num_tokens": 64374342.0, + "reward": 0.55859375, + "reward_std": 0.07933359593153, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.529777777777778, + "grad_norm": 0.6477386570332615, + "kl": 0.448974609375, + "learning_rate": 6.994273741626405e-08, + "loss": 0.0004, + "num_tokens": 64524470.0, + "reward": 0.59375, + "reward_std": 0.0625, + "rewards/equation_reward_func/mean": 0.21875, + "rewards/equation_reward_func/std": 0.41502299904823303, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5333333333333332, + "grad_norm": 0.49484841194714935, + "kl": 0.42822265625, + "learning_rate": 6.8945992689391e-08, + "loss": 0.0004, + "num_tokens": 64674642.0, + "reward": 0.57421875, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.536888888888889, + "grad_norm": 0.5641664834282943, + "kl": 0.511962890625, + "learning_rate": 6.795526404358628e-08, + "loss": 0.0005, + "num_tokens": 64824714.0, + "reward": 0.55859375, + "reward_std": 0.05050079524517059, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5404444444444443, + "grad_norm": 0.7774048704942202, + "kl": 0.4755859375, + "learning_rate": 6.697058439888283e-08, + "loss": 0.0005, + "num_tokens": 64974826.0, + "reward": 0.52734375, + "reward_std": 0.08307026326656342, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.544, + "grad_norm": 0.5632886984383448, + "kl": 0.44482421875, + "learning_rate": 6.599198647431642e-08, + "loss": 0.0004, + "num_tokens": 65125010.0, + "reward": 0.5390625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5475555555555556, + "grad_norm": 0.6453682372916957, + "kl": 0.515869140625, + "learning_rate": 6.501950278683907e-08, + "loss": 0.0005, + "num_tokens": 65275094.0, + "reward": 0.54296875, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.551111111111111, + "grad_norm": 0.5003006401865986, + "kl": 0.449462890625, + "learning_rate": 6.405316565023805e-08, + "loss": 0.0004, + "num_tokens": 65425174.0, + "reward": 0.55078125, + "reward_std": 0.03619525954127312, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5546666666666666, + "grad_norm": 0.468183524036864, + "kl": 0.427490234375, + "learning_rate": 6.309300717406274e-08, + "loss": 0.0004, + "num_tokens": 65575170.0, + "reward": 0.578125, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5582222222222222, + "grad_norm": 0.4013690840146761, + "kl": 0.392333984375, + "learning_rate": 6.213905926255697e-08, + "loss": 0.0004, + "num_tokens": 65725174.0, + "reward": 0.5625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.561777777777778, + "grad_norm": 0.5052845789767202, + "kl": 0.395263671875, + "learning_rate": 6.119135361359965e-08, + "loss": 0.0004, + "num_tokens": 65875286.0, + "reward": 0.51953125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5653333333333332, + "grad_norm": 0.7242539146561551, + "kl": 0.483642578125, + "learning_rate": 6.024992171765089e-08, + "loss": 0.0005, + "num_tokens": 66025386.0, + "reward": 0.5859375, + "reward_std": 0.07866839319467545, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.568888888888889, + "grad_norm": 0.3684742193914073, + "kl": 0.396728515625, + "learning_rate": 5.9314794856705983e-08, + "loss": 0.0004, + "num_tokens": 66175466.0, + "reward": 0.546875, + "reward_std": 0.037403859198093414, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5724444444444443, + "grad_norm": 0.49890493402893316, + "kl": 0.42822265625, + "learning_rate": 5.8386004103255975e-08, + "loss": 0.0004, + "num_tokens": 66325550.0, + "reward": 0.53515625, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.576, + "grad_norm": 0.4438983691416006, + "kl": 0.461181640625, + "learning_rate": 5.7463580319254853e-08, + "loss": 0.0005, + "num_tokens": 66475646.0, + "reward": 0.51171875, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5795555555555556, + "grad_norm": 0.48341809431444255, + "kl": 0.456298828125, + "learning_rate": 5.6547554155094626e-08, + "loss": 0.0005, + "num_tokens": 66625682.0, + "reward": 0.53515625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5831111111111111, + "grad_norm": 0.7056142221033674, + "kl": 0.470703125, + "learning_rate": 5.563795604858615e-08, + "loss": 0.0005, + "num_tokens": 66775798.0, + "reward": 0.52734375, + "reward_std": 0.06370859593153, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5866666666666667, + "grad_norm": 6.779742188034057, + "kl": 0.804931640625, + "learning_rate": 5.473481622394849e-08, + "loss": 0.0008, + "num_tokens": 66925906.0, + "reward": 0.546875, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5902222222222222, + "grad_norm": 1.40942249947706, + "kl": 0.533203125, + "learning_rate": 5.3838164690803935e-08, + "loss": 0.0005, + "num_tokens": 67075970.0, + "reward": 0.51171875, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5937777777777777, + "grad_norm": 0.5709869106367856, + "kl": 0.514892578125, + "learning_rate": 5.294803124318145e-08, + "loss": 0.0005, + "num_tokens": 67226142.0, + "reward": 0.5390625, + "reward_std": 0.047418396919965744, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.5973333333333333, + "grad_norm": 0.7428039346413163, + "kl": 0.47802734375, + "learning_rate": 5.20644454585262e-08, + "loss": 0.0005, + "num_tokens": 67376314.0, + "reward": 0.51171875, + "reward_std": 0.06370859593153, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.600888888888889, + "grad_norm": 0.47757983506186286, + "kl": 0.44580078125, + "learning_rate": 5.1187436696716906e-08, + "loss": 0.0004, + "num_tokens": 67526314.0, + "reward": 0.52734375, + "reward_std": 0.030584799125790596, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6044444444444443, + "grad_norm": 0.6183437552202213, + "kl": 0.5078125, + "learning_rate": 5.0317034099090524e-08, + "loss": 0.0005, + "num_tokens": 67676354.0, + "reward": 0.52734375, + "reward_std": 0.04808359593153, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.608, + "grad_norm": 0.584260562502066, + "kl": 0.486083984375, + "learning_rate": 4.9453266587473423e-08, + "loss": 0.0005, + "num_tokens": 67826470.0, + "reward": 0.5546875, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6115555555555554, + "grad_norm": 0.536997119691752, + "kl": 0.43310546875, + "learning_rate": 4.859616286322094e-08, + "loss": 0.0004, + "num_tokens": 67976526.0, + "reward": 0.5546875, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6151111111111112, + "grad_norm": 0.4285196702276041, + "kl": 0.413818359375, + "learning_rate": 4.774575140626316e-08, + "loss": 0.0004, + "num_tokens": 68126606.0, + "reward": 0.55859375, + "reward_std": 0.025854695588350296, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6186666666666667, + "grad_norm": 0.6432882809805262, + "kl": 0.40478515625, + "learning_rate": 4.6902060474159036e-08, + "loss": 0.0004, + "num_tokens": 68276658.0, + "reward": 0.59375, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6222222222222222, + "grad_norm": 0.670105632966217, + "kl": 0.43212890625, + "learning_rate": 4.6065118101157016e-08, + "loss": 0.0004, + "num_tokens": 68426750.0, + "reward": 0.5078125, + "reward_std": 0.06204995512962341, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6257777777777778, + "grad_norm": 0.7554410323140208, + "kl": 0.504150390625, + "learning_rate": 4.5234952097263965e-08, + "loss": 0.0005, + "num_tokens": 68576790.0, + "reward": 0.5625, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6293333333333333, + "grad_norm": 0.5383642917248337, + "kl": 0.489013671875, + "learning_rate": 4.4411590047320617e-08, + "loss": 0.0005, + "num_tokens": 68726862.0, + "reward": 0.5625, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6328888888888888, + "grad_norm": 0.6764051889067699, + "kl": 0.48876953125, + "learning_rate": 4.359505931008553e-08, + "loss": 0.0005, + "num_tokens": 68877026.0, + "reward": 0.5546875, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6364444444444444, + "grad_norm": 0.608366482588541, + "kl": 0.46923828125, + "learning_rate": 4.278538701732534e-08, + "loss": 0.0005, + "num_tokens": 69027150.0, + "reward": 0.546875, + "reward_std": 0.049292195588350296, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6400000000000001, + "grad_norm": 0.6650025403400517, + "kl": 0.48193359375, + "learning_rate": 4.198260007291399e-08, + "loss": 0.0005, + "num_tokens": 69177238.0, + "reward": 0.5, + "reward_std": 0.0625, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6435555555555554, + "grad_norm": 0.4721462056014319, + "kl": 0.51953125, + "learning_rate": 4.118672515193794e-08, + "loss": 0.0005, + "num_tokens": 69327298.0, + "reward": 0.5390625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6471111111111112, + "grad_norm": 0.6621926942392276, + "kl": 0.495361328125, + "learning_rate": 4.039778869981064e-08, + "loss": 0.0005, + "num_tokens": 69477314.0, + "reward": 0.546875, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6506666666666665, + "grad_norm": 0.7308447153300471, + "kl": 0.541748046875, + "learning_rate": 3.961581693139307e-08, + "loss": 0.0005, + "num_tokens": 69627398.0, + "reward": 0.55859375, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6542222222222223, + "grad_norm": 0.847646129412192, + "kl": 0.5849609375, + "learning_rate": 3.884083583012318e-08, + "loss": 0.0006, + "num_tokens": 69777462.0, + "reward": 0.58203125, + "reward_std": 0.0703125, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6577777777777778, + "grad_norm": 0.5867478692263943, + "kl": 0.453857421875, + "learning_rate": 3.807287114715216e-08, + "loss": 0.0005, + "num_tokens": 69927534.0, + "reward": 0.546875, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6613333333333333, + "grad_norm": 0.5569296548464157, + "kl": 0.53515625, + "learning_rate": 3.731194840048915e-08, + "loss": 0.0005, + "num_tokens": 70077530.0, + "reward": 0.58203125, + "reward_std": 0.04808359593153, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6648888888888889, + "grad_norm": 0.47294248502288805, + "kl": 0.427490234375, + "learning_rate": 3.655809287415284e-08, + "loss": 0.0004, + "num_tokens": 70227594.0, + "reward": 0.5234375, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6684444444444444, + "grad_norm": 0.4574981255397005, + "kl": 0.4287109375, + "learning_rate": 3.581132961733191e-08, + "loss": 0.0004, + "num_tokens": 70377714.0, + "reward": 0.53515625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6720000000000002, + "grad_norm": 0.40519803603997345, + "kl": 0.4169921875, + "learning_rate": 3.5071683443552045e-08, + "loss": 0.0004, + "num_tokens": 70527870.0, + "reward": 0.52734375, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6755555555555555, + "grad_norm": 0.4840050767135489, + "kl": 0.47607421875, + "learning_rate": 3.433917892985208e-08, + "loss": 0.0005, + "num_tokens": 70677974.0, + "reward": 0.4921875, + "reward_std": 0.028382759541273117, + "rewards/equation_reward_func/mean": 0.0078125, + "rewards/equation_reward_func/std": 0.0883883461356163, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6791111111111112, + "grad_norm": 0.5689276035825345, + "kl": 0.447509765625, + "learning_rate": 3.3613840415966764e-08, + "loss": 0.0004, + "num_tokens": 70828062.0, + "reward": 0.5078125, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6826666666666665, + "grad_norm": 0.3906743686263679, + "kl": 0.39892578125, + "learning_rate": 3.2895692003518575e-08, + "loss": 0.0004, + "num_tokens": 70978234.0, + "reward": 0.48828125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6862222222222223, + "grad_norm": 0.6319095516033488, + "kl": 0.4541015625, + "learning_rate": 3.218475755521621e-08, + "loss": 0.0005, + "num_tokens": 71128366.0, + "reward": 0.55859375, + "reward_std": 0.05710469186306, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6897777777777778, + "grad_norm": 0.6527920446086392, + "kl": 0.404541015625, + "learning_rate": 3.1481060694062365e-08, + "loss": 0.0004, + "num_tokens": 71278446.0, + "reward": 0.53515625, + "reward_std": 0.06370859593153, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.6933333333333334, + "grad_norm": 0.609657629006971, + "kl": 0.41162109375, + "learning_rate": 3.078462480256819e-08, + "loss": 0.0004, + "num_tokens": 71428610.0, + "reward": 0.53125, + "reward_std": 0.058313291519880295, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.696888888888889, + "grad_norm": 0.8910468177123286, + "kl": 0.513427734375, + "learning_rate": 3.0095473021976794e-08, + "loss": 0.0005, + "num_tokens": 71578734.0, + "reward": 0.5078125, + "reward_std": 0.07152109593153, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7004444444444444, + "grad_norm": 0.8425501635040524, + "kl": 0.51416015625, + "learning_rate": 2.9413628251493934e-08, + "loss": 0.0005, + "num_tokens": 71728878.0, + "reward": 0.55078125, + "reward_std": 0.09077189117670059, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.704, + "grad_norm": 0.6418930440563599, + "kl": 0.430908203125, + "learning_rate": 2.8739113147527417e-08, + "loss": 0.0004, + "num_tokens": 71878958.0, + "reward": 0.5703125, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.15625, + "rewards/equation_reward_func/std": 0.3645188808441162, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7075555555555555, + "grad_norm": 1.0250618122706754, + "kl": 0.630859375, + "learning_rate": 2.8071950122934036e-08, + "loss": 0.0006, + "num_tokens": 72029110.0, + "reward": 0.53515625, + "reward_std": 0.07085589319467545, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7111111111111112, + "grad_norm": 0.6070267863410984, + "kl": 0.4892578125, + "learning_rate": 2.7412161346275052e-08, + "loss": 0.0005, + "num_tokens": 72179178.0, + "reward": 0.53515625, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7146666666666666, + "grad_norm": 0.41436971147539053, + "kl": 0.473388671875, + "learning_rate": 2.675976874107935e-08, + "loss": 0.0005, + "num_tokens": 72329226.0, + "reward": 0.51171875, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7182222222222223, + "grad_norm": 0.7994525127067181, + "kl": 0.623046875, + "learning_rate": 2.611479398511518e-08, + "loss": 0.0006, + "num_tokens": 72479326.0, + "reward": 0.51953125, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7217777777777776, + "grad_norm": 3.591555794980261, + "kl": 0.92333984375, + "learning_rate": 2.5477258509669614e-08, + "loss": 0.0009, + "num_tokens": 72629430.0, + "reward": 0.55859375, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7253333333333334, + "grad_norm": 0.5523457619977477, + "kl": 0.407958984375, + "learning_rate": 2.4847183498836714e-08, + "loss": 0.0004, + "num_tokens": 72779562.0, + "reward": 0.55078125, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.728888888888889, + "grad_norm": 0.5337513777324062, + "kl": 0.489013671875, + "learning_rate": 2.4224589888813263e-08, + "loss": 0.0005, + "num_tokens": 72929630.0, + "reward": 0.49609375, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7324444444444445, + "grad_norm": 3.336975111696118, + "kl": 0.9912109375, + "learning_rate": 2.3609498367203467e-08, + "loss": 0.001, + "num_tokens": 73079714.0, + "reward": 0.49609375, + "reward_std": 0.06744526326656342, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.736, + "grad_norm": 0.37339089559378696, + "kl": 0.4267578125, + "learning_rate": 2.300192937233128e-08, + "loss": 0.0004, + "num_tokens": 73229834.0, + "reward": 0.51953125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7395555555555555, + "grad_norm": 0.5184832733671315, + "kl": 0.509033203125, + "learning_rate": 2.240190309256143e-08, + "loss": 0.0005, + "num_tokens": 73379898.0, + "reward": 0.5, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0234375, + "rewards/equation_reward_func/std": 0.15188287198543549, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.743111111111111, + "grad_norm": 0.937777108795673, + "kl": 0.7412109375, + "learning_rate": 2.1809439465628382e-08, + "loss": 0.0007, + "num_tokens": 73530006.0, + "reward": 0.5625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7466666666666666, + "grad_norm": 0.44792777397212485, + "kl": 0.432373046875, + "learning_rate": 2.122455817797428e-08, + "loss": 0.0004, + "num_tokens": 73680138.0, + "reward": 0.50390625, + "reward_std": 0.016833597794175148, + "rewards/equation_reward_func/mean": 0.015625, + "rewards/equation_reward_func/std": 0.12450689822435379, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7502222222222223, + "grad_norm": 1.8165365556826047, + "kl": 1.04248046875, + "learning_rate": 2.0647278664094188e-08, + "loss": 0.001, + "num_tokens": 73830170.0, + "reward": 0.53125, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7537777777777777, + "grad_norm": 0.5746953789475809, + "kl": 0.496337890625, + "learning_rate": 2.007762010589098e-08, + "loss": 0.0005, + "num_tokens": 73980250.0, + "reward": 0.515625, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9375, + "rewards/format_reward_func/std": 0.24301259219646454, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7573333333333334, + "grad_norm": 34.458999462219055, + "kl": 9.58984375, + "learning_rate": 1.9515601432037317e-08, + "loss": 0.0097, + "num_tokens": 74130306.0, + "reward": 0.51953125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7608888888888887, + "grad_norm": 1.1445300736653963, + "kl": 0.626953125, + "learning_rate": 1.8961241317347333e-08, + "loss": 0.0006, + "num_tokens": 74280438.0, + "reward": 0.515625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7644444444444445, + "grad_norm": 0.530745335656726, + "kl": 0.537109375, + "learning_rate": 1.8414558182155456e-08, + "loss": 0.0005, + "num_tokens": 74430474.0, + "reward": 0.50390625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.768, + "grad_norm": 4.325748345041621, + "kl": 0.811767578125, + "learning_rate": 1.787557019170488e-08, + "loss": 0.0008, + "num_tokens": 74580578.0, + "reward": 0.53515625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7715555555555556, + "grad_norm": 0.6871408255583382, + "kl": 0.51953125, + "learning_rate": 1.734429525554365e-08, + "loss": 0.0005, + "num_tokens": 74730626.0, + "reward": 0.4921875, + "reward_std": 0.04929219186306, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.775111111111111, + "grad_norm": 0.5276217136164556, + "kl": 0.505859375, + "learning_rate": 1.6820751026929674e-08, + "loss": 0.0005, + "num_tokens": 74880714.0, + "reward": 0.5, + "reward_std": 0.04642495512962341, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7786666666666666, + "grad_norm": 0.6364230686117014, + "kl": 0.4501953125, + "learning_rate": 1.6304954902244095e-08, + "loss": 0.0005, + "num_tokens": 75030838.0, + "reward": 0.52734375, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7822222222222224, + "grad_norm": 0.45223361174699134, + "kl": 0.44482421875, + "learning_rate": 1.5796924020413327e-08, + "loss": 0.0004, + "num_tokens": 75180934.0, + "reward": 0.578125, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7857777777777777, + "grad_norm": 3.248583973869253, + "kl": 0.61669921875, + "learning_rate": 1.529667526233941e-08, + "loss": 0.0006, + "num_tokens": 75330966.0, + "reward": 0.57421875, + "reward_std": 0.016833597794175148, + "rewards/equation_reward_func/mean": 0.1640625, + "rewards/equation_reward_func/std": 0.371787428855896, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7893333333333334, + "grad_norm": 0.4374604603350181, + "kl": 0.434326171875, + "learning_rate": 1.4804225250339281e-08, + "loss": 0.0004, + "num_tokens": 75481026.0, + "reward": 0.50390625, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7928888888888888, + "grad_norm": 3.0992385739650823, + "kl": 0.75537109375, + "learning_rate": 1.4319590347592254e-08, + "loss": 0.0008, + "num_tokens": 75631058.0, + "reward": 0.546875, + "reward_std": 0.06304339319467545, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.7964444444444445, + "grad_norm": 0.652594553880175, + "kl": 0.421142578125, + "learning_rate": 1.3842786657596446e-08, + "loss": 0.0004, + "num_tokens": 75781122.0, + "reward": 0.52734375, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8, + "grad_norm": 0.765818438315625, + "kl": 0.432373046875, + "learning_rate": 1.3373830023633597e-08, + "loss": 0.0004, + "num_tokens": 75931254.0, + "reward": 0.515625, + "reward_std": 0.08956328779459, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9296875, + "rewards/format_reward_func/std": 0.2566775679588318, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8035555555555556, + "grad_norm": 0.3411460860044164, + "kl": 0.439453125, + "learning_rate": 1.2912736028242777e-08, + "loss": 0.0004, + "num_tokens": 76081346.0, + "reward": 0.5390625, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8071111111111111, + "grad_norm": 0.7582340748491105, + "kl": 0.58740234375, + "learning_rate": 1.2459519992702311e-08, + "loss": 0.0006, + "num_tokens": 76231422.0, + "reward": 0.515625, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8106666666666666, + "grad_norm": 0.6931442469315353, + "kl": 0.43994140625, + "learning_rate": 1.2014196976521035e-08, + "loss": 0.0004, + "num_tokens": 76381442.0, + "reward": 0.578125, + "reward_std": 0.07206448912620544, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8142222222222222, + "grad_norm": 0.6242469027810024, + "kl": 0.418701171875, + "learning_rate": 1.1576781776937634e-08, + "loss": 0.0004, + "num_tokens": 76531578.0, + "reward": 0.53125, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8177777777777777, + "grad_norm": 0.5319993004285838, + "kl": 0.42333984375, + "learning_rate": 1.1147288928429116e-08, + "loss": 0.0004, + "num_tokens": 76681742.0, + "reward": 0.53125, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0859375, + "rewards/equation_reward_func/std": 0.2813730239868164, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8213333333333335, + "grad_norm": 4.892495144239733, + "kl": 1.937744140625, + "learning_rate": 1.0725732702227735e-08, + "loss": 0.0019, + "num_tokens": 76831858.0, + "reward": 0.515625, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8248888888888888, + "grad_norm": 1.0196354843303994, + "kl": 0.447998046875, + "learning_rate": 1.0312127105846947e-08, + "loss": 0.0004, + "num_tokens": 76981942.0, + "reward": 0.5546875, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.1328125, + "rewards/equation_reward_func/std": 0.3407054841518402, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8284444444444445, + "grad_norm": 0.7205299980402574, + "kl": 0.420654296875, + "learning_rate": 9.906485882615695e-09, + "loss": 0.0004, + "num_tokens": 77132030.0, + "reward": 0.51953125, + "reward_std": 0.057104695588350296, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8319999999999999, + "grad_norm": 4.0685262934583095, + "kl": 1.60107421875, + "learning_rate": 9.50882251122212e-09, + "loss": 0.0016, + "num_tokens": 77282094.0, + "reward": 0.52734375, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8355555555555556, + "grad_norm": 0.627281356606571, + "kl": 0.442626953125, + "learning_rate": 9.119150205265324e-09, + "loss": 0.0004, + "num_tokens": 77432230.0, + "reward": 0.51953125, + "reward_std": 0.0546875, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8391111111111111, + "grad_norm": 0.3858806846831556, + "kl": 0.45751953125, + "learning_rate": 8.737481912816592e-09, + "loss": 0.0005, + "num_tokens": 77582310.0, + "reward": 0.58984375, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.1953125, + "rewards/equation_reward_func/std": 0.3979988098144531, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8426666666666667, + "grad_norm": 0.23991137840603588, + "kl": 0.4140625, + "learning_rate": 8.363830315988945e-09, + "loss": 0.0004, + "num_tokens": 77732286.0, + "reward": 0.52734375, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8462222222222222, + "grad_norm": 0.4740890347365351, + "kl": 0.447998046875, + "learning_rate": 7.99820783051583e-09, + "loss": 0.0004, + "num_tokens": 77882334.0, + "reward": 0.56640625, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8497777777777777, + "grad_norm": 0.5700442254866067, + "kl": 0.484375, + "learning_rate": 7.640626605338624e-09, + "loss": 0.0005, + "num_tokens": 78032414.0, + "reward": 0.515625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8533333333333335, + "grad_norm": 0.41681350867652445, + "kl": 0.423095703125, + "learning_rate": 7.291098522202776e-09, + "loss": 0.0004, + "num_tokens": 78182498.0, + "reward": 0.546875, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8568888888888888, + "grad_norm": 0.7337724383393545, + "kl": 0.47021484375, + "learning_rate": 6.949635195263259e-09, + "loss": 0.0005, + "num_tokens": 78332678.0, + "reward": 0.546875, + "reward_std": 0.058313291519880295, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8604444444444446, + "grad_norm": 0.4627777960335074, + "kl": 0.410888671875, + "learning_rate": 6.616247970698319e-09, + "loss": 0.0004, + "num_tokens": 78482726.0, + "reward": 0.58203125, + "reward_std": 0.04808359593153, + "rewards/equation_reward_func/mean": 0.1875, + "rewards/equation_reward_func/std": 0.39184603095054626, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8639999999999999, + "grad_norm": 0.9327871091407389, + "kl": 0.59765625, + "learning_rate": 6.290947926332835e-09, + "loss": 0.0006, + "num_tokens": 78632766.0, + "reward": 0.5234375, + "reward_std": 0.07152109593153, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9453125, + "rewards/format_reward_func/std": 0.22826264798641205, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8675555555555556, + "grad_norm": 0.5684611750891795, + "kl": 0.43798828125, + "learning_rate": 5.97374587126992e-09, + "loss": 0.0004, + "num_tokens": 78782810.0, + "reward": 0.5390625, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.871111111111111, + "grad_norm": 0.41112080183218475, + "kl": 0.45068359375, + "learning_rate": 5.664652345531845e-09, + "loss": 0.0005, + "num_tokens": 78932894.0, + "reward": 0.53515625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8746666666666667, + "grad_norm": 0.6226854601113873, + "kl": 0.4345703125, + "learning_rate": 5.363677619709933e-09, + "loss": 0.0004, + "num_tokens": 79082994.0, + "reward": 0.54296875, + "reward_std": 0.0390625, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8782222222222222, + "grad_norm": 0.40926220755087916, + "kl": 0.460205078125, + "learning_rate": 5.070831694623135e-09, + "loss": 0.0005, + "num_tokens": 79233062.0, + "reward": 0.55078125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8817777777777778, + "grad_norm": 0.3715644209620836, + "kl": 0.443359375, + "learning_rate": 4.786124300985822e-09, + "loss": 0.0004, + "num_tokens": 79383142.0, + "reward": 0.5859375, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.1796875, + "rewards/equation_reward_func/std": 0.3854354918003082, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8853333333333333, + "grad_norm": 0.41303999956467335, + "kl": 0.425048828125, + "learning_rate": 4.509564899084328e-09, + "loss": 0.0004, + "num_tokens": 79533302.0, + "reward": 0.51953125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8888888888888888, + "grad_norm": 0.34711935286318774, + "kl": 0.502685546875, + "learning_rate": 4.241162678462806e-09, + "loss": 0.0005, + "num_tokens": 79683382.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/equation_reward_func/mean": 0.0, + "rewards/equation_reward_func/std": 0.0, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8924444444444446, + "grad_norm": 1.749104726801777, + "kl": 0.57666015625, + "learning_rate": 3.9809265576176146e-09, + "loss": 0.0006, + "num_tokens": 79833494.0, + "reward": 0.5546875, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.896, + "grad_norm": 0.317239752869522, + "kl": 0.460693359375, + "learning_rate": 3.7288651837012745e-09, + "loss": 0.0005, + "num_tokens": 79983606.0, + "reward": 0.5234375, + "reward_std": 0.009021097794175148, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.8995555555555557, + "grad_norm": 0.5981545675620618, + "kl": 0.4501953125, + "learning_rate": 3.4849869322348126e-09, + "loss": 0.0005, + "num_tokens": 80133666.0, + "reward": 0.5234375, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.903111111111111, + "grad_norm": 1.8987553387497587, + "kl": 0.595703125, + "learning_rate": 3.249299906829761e-09, + "loss": 0.0006, + "num_tokens": 80283778.0, + "reward": 0.5234375, + "reward_std": 0.037403859198093414, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9066666666666667, + "grad_norm": 1.3482962026459895, + "kl": 0.652587890625, + "learning_rate": 3.0218119389186502e-09, + "loss": 0.0007, + "num_tokens": 80433874.0, + "reward": 0.57421875, + "reward_std": 0.04620979726314545, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9102222222222223, + "grad_norm": 0.29399766813544437, + "kl": 0.393798828125, + "learning_rate": 2.8025305874949945e-09, + "loss": 0.0004, + "num_tokens": 80583958.0, + "reward": 0.58203125, + "reward_std": 0.016833597794175148, + "rewards/equation_reward_func/mean": 0.171875, + "rewards/equation_reward_func/std": 0.3787541687488556, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9137777777777778, + "grad_norm": 0.5162389083983054, + "kl": 0.40966796875, + "learning_rate": 2.5914631388619103e-09, + "loss": 0.0004, + "num_tokens": 80734106.0, + "reward": 0.5390625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9173333333333333, + "grad_norm": 0.5828591533810914, + "kl": 0.423583984375, + "learning_rate": 2.388616606390198e-09, + "loss": 0.0004, + "num_tokens": 80884218.0, + "reward": 0.5234375, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.078125, + "rewards/equation_reward_func/std": 0.2694226801395416, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9208888888888889, + "grad_norm": 0.5133961922979738, + "kl": 0.511962890625, + "learning_rate": 2.193997730285141e-09, + "loss": 0.0005, + "num_tokens": 81034370.0, + "reward": 0.5390625, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9244444444444444, + "grad_norm": 0.6682775620951583, + "kl": 0.51123046875, + "learning_rate": 2.0076129773627103e-09, + "loss": 0.0005, + "num_tokens": 81184450.0, + "reward": 0.5, + "reward_std": 0.05589609593153, + "rewards/equation_reward_func/mean": 0.03125, + "rewards/equation_reward_func/std": 0.1746762990951538, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.928, + "grad_norm": 0.58701555865206, + "kl": 0.44287109375, + "learning_rate": 1.8294685408345167e-09, + "loss": 0.0004, + "num_tokens": 81334506.0, + "reward": 0.5625, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.140625, + "rewards/equation_reward_func/std": 0.3490002751350403, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9315555555555557, + "grad_norm": 0.5226655218446554, + "kl": 0.406005859375, + "learning_rate": 1.6595703401020844e-09, + "loss": 0.0004, + "num_tokens": 81484590.0, + "reward": 0.55078125, + "reward_std": 0.05050079524517059, + "rewards/equation_reward_func/mean": 0.1171875, + "rewards/equation_reward_func/std": 0.322907418012619, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.935111111111111, + "grad_norm": 0.44785902127312094, + "kl": 0.45068359375, + "learning_rate": 1.497924020560204e-09, + "loss": 0.0005, + "num_tokens": 81634682.0, + "reward": 0.51171875, + "reward_std": 0.030584799125790596, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9386666666666668, + "grad_norm": 1.2271943789655686, + "kl": 0.521728515625, + "learning_rate": 1.3445349534093598e-09, + "loss": 0.0005, + "num_tokens": 81784818.0, + "reward": 0.53515625, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.942222222222222, + "grad_norm": 0.34440149179608015, + "kl": 0.428955078125, + "learning_rate": 1.199408235477123e-09, + "loss": 0.0004, + "num_tokens": 81934862.0, + "reward": 0.5390625, + "reward_std": 0.024646097794175148, + "rewards/equation_reward_func/mean": 0.1015625, + "rewards/equation_reward_func/std": 0.3032590448856354, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9457777777777778, + "grad_norm": 0.5299180419650383, + "kl": 0.398681640625, + "learning_rate": 1.0625486890488978e-09, + "loss": 0.0004, + "num_tokens": 82084958.0, + "reward": 0.5078125, + "reward_std": 0.04027109593153, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9493333333333334, + "grad_norm": 0.7383076800700398, + "kl": 0.576171875, + "learning_rate": 9.339608617077165e-10, + "loss": 0.0006, + "num_tokens": 82234994.0, + "reward": 0.515625, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.0703125, + "rewards/equation_reward_func/std": 0.2566775679588318, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.952888888888889, + "grad_norm": 2.413667288202016, + "kl": 0.955322265625, + "learning_rate": 8.136490261830553e-10, + "loss": 0.001, + "num_tokens": 82385046.0, + "reward": 0.5078125, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.046875, + "rewards/equation_reward_func/std": 0.21220162510871887, + "rewards/format_reward_func/mean": 0.96875, + "rewards/format_reward_func/std": 0.1746762990951538, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9564444444444444, + "grad_norm": 0.621725268878221, + "kl": 0.462890625, + "learning_rate": 7.016171802088633e-10, + "loss": 0.0005, + "num_tokens": 82535218.0, + "reward": 0.5390625, + "reward_std": 0.049292195588350296, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.96, + "grad_norm": 0.23655919159399819, + "kl": 0.40673828125, + "learning_rate": 5.978690463908087e-10, + "loss": 0.0004, + "num_tokens": 82685354.0, + "reward": 0.55859375, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.125, + "rewards/equation_reward_func/std": 0.3320184051990509, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9635555555555557, + "grad_norm": 0.47785457137586707, + "kl": 0.448974609375, + "learning_rate": 5.024080720824608e-10, + "loss": 0.0004, + "num_tokens": 82835570.0, + "reward": 0.51953125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.967111111111111, + "grad_norm": 0.346206288127188, + "kl": 0.446533203125, + "learning_rate": 4.152374292708538e-10, + "loss": 0.0004, + "num_tokens": 82985646.0, + "reward": 0.51953125, + "reward_std": 0.025854695588350296, + "rewards/equation_reward_func/mean": 0.0390625, + "rewards/equation_reward_func/std": 0.194504976272583, + "rewards/format_reward_func/mean": 1.0, + "rewards/format_reward_func/std": 0.0, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9706666666666668, + "grad_norm": 0.3832712574599466, + "kl": 0.40869140625, + "learning_rate": 3.363600144710155e-10, + "loss": 0.0004, + "num_tokens": 83135678.0, + "reward": 0.55078125, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.974222222222222, + "grad_norm": 0.24281220847577703, + "kl": 0.391357421875, + "learning_rate": 2.6577844862973877e-10, + "loss": 0.0004, + "num_tokens": 83285698.0, + "reward": 0.52734375, + "reward_std": 0.0078125, + "rewards/equation_reward_func/mean": 0.0625, + "rewards/equation_reward_func/std": 0.24301259219646454, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9777777777777779, + "grad_norm": 0.9582617582460151, + "kl": 0.546142578125, + "learning_rate": 2.0349507703851243e-10, + "loss": 0.0005, + "num_tokens": 83435778.0, + "reward": 0.57421875, + "reward_std": 0.03245859593153, + "rewards/equation_reward_func/mean": 0.1640625, + "rewards/equation_reward_func/std": 0.371787428855896, + "rewards/format_reward_func/mean": 0.984375, + "rewards/format_reward_func/std": 0.12450689822435379, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9813333333333332, + "grad_norm": 0.35323783838354617, + "kl": 0.42431640625, + "learning_rate": 1.4951196925561127e-10, + "loss": 0.0004, + "num_tokens": 83585798.0, + "reward": 0.54296875, + "reward_std": 0.0234375, + "rewards/equation_reward_func/mean": 0.09375, + "rewards/equation_reward_func/std": 0.29262590408325195, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.984888888888889, + "grad_norm": 1.4092624453276854, + "kl": 0.759521484375, + "learning_rate": 1.0383091903720665e-10, + "loss": 0.0008, + "num_tokens": 83735938.0, + "reward": 0.5625, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.1484375, + "rewards/equation_reward_func/std": 0.356930136680603, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9884444444444445, + "grad_norm": 0.7220556700564954, + "kl": 0.511474609375, + "learning_rate": 6.645344427794186e-11, + "loss": 0.0005, + "num_tokens": 83886042.0, + "reward": 0.53125, + "reward_std": 0.049292195588350296, + "rewards/equation_reward_func/mean": 0.109375, + "rewards/equation_reward_func/std": 0.31333550810813904, + "rewards/format_reward_func/mean": 0.953125, + "rewards/format_reward_func/std": 0.21220162510871887, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.992, + "grad_norm": 0.3915357093577265, + "kl": 0.454833984375, + "learning_rate": 3.738078696036151e-11, + "loss": 0.0005, + "num_tokens": 84036218.0, + "reward": 0.5234375, + "reward_std": 0.015625, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.9921875, + "rewards/format_reward_func/std": 0.0883883461356163, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.9955555555555555, + "grad_norm": 0.4708155060402308, + "kl": 0.412353515625, + "learning_rate": 1.6613913113694423e-11, + "loss": 0.0004, + "num_tokens": 84186286.0, + "reward": 0.578125, + "reward_std": 0.03125, + "rewards/equation_reward_func/mean": 0.1796875, + "rewards/equation_reward_func/std": 0.3854354918003082, + "rewards/format_reward_func/mean": 0.9765625, + "rewards/format_reward_func/std": 0.15188287198543549, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "epoch": 1.999111111111111, + "grad_norm": 0.7953265607446423, + "kl": 0.6337890625, + "learning_rate": 4.153512781768231e-12, + "loss": 0.0006, + "num_tokens": 84336314.0, + "reward": 0.5078125, + "reward_std": 0.046875, + "rewards/equation_reward_func/mean": 0.0546875, + "rewards/equation_reward_func/std": 0.22826264798641205, + "rewards/format_reward_func/mean": 0.9609375, + "rewards/format_reward_func/std": 0.194504976272583, + "step": 562 + }, + { + "epoch": 1.999111111111111, + "step": 562, + "total_flos": 0.0, + "train_loss": 0.010520504666011201, + "train_runtime": 15057.2865, + "train_samples_per_second": 1.195, + "train_steps_per_second": 0.037 + } + ], + "logging_steps": 1, + "max_steps": 562, + "num_input_tokens_seen": 84336314, + "num_train_epochs": 2, + "save_steps": 25, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}